symbolimind 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- symbolimind-0.1.0/LICENSE +21 -0
- symbolimind-0.1.0/PKG-INFO +75 -0
- symbolimind-0.1.0/README.md +48 -0
- symbolimind-0.1.0/pyproject.toml +34 -0
- symbolimind-0.1.0/setup.cfg +4 -0
- symbolimind-0.1.0/symbolimind/__init__.py +5 -0
- symbolimind-0.1.0/symbolimind/brain.py +164 -0
- symbolimind-0.1.0/symbolimind/engine.py +230 -0
- symbolimind-0.1.0/symbolimind/extensions.py +968 -0
- symbolimind-0.1.0/symbolimind/extensions_clean.py +965 -0
- symbolimind-0.1.0/symbolimind/grammar.py +116 -0
- symbolimind-0.1.0/symbolimind/memory.py +59 -0
- symbolimind-0.1.0/symbolimind/skill.py +55 -0
- symbolimind-0.1.0/symbolimind/unified.py +85 -0
- symbolimind-0.1.0/symbolimind.egg-info/PKG-INFO +75 -0
- symbolimind-0.1.0/symbolimind.egg-info/SOURCES.txt +18 -0
- symbolimind-0.1.0/symbolimind.egg-info/dependency_links.txt +1 -0
- symbolimind-0.1.0/symbolimind.egg-info/requires.txt +3 -0
- symbolimind-0.1.0/symbolimind.egg-info/top_level.txt +1 -0
- symbolimind-0.1.0/tests/test_memory.py +24 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 SymbolicMind Team (https://github.com/d87skg)
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: symbolimind
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: AI-powered symbolic regression engine that discovers governing equations from data.
|
|
5
|
+
Author: SymbolicMind Team
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/d87skg/SymbolicMind
|
|
8
|
+
Project-URL: Repository, https://github.com/d87skg/SymbolicMind.git
|
|
9
|
+
Project-URL: Issues, https://github.com/d87skg/SymbolicMind/issues
|
|
10
|
+
Keywords: symbolic-regression,physics,ai,machine-learning,science
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Physics
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: numpy
|
|
24
|
+
Requires-Dist: scipy
|
|
25
|
+
Requires-Dist: scikit-learn
|
|
26
|
+
Dynamic: license-file
|
|
27
|
+
|
|
28
|
+
# SymbolicMind
|
|
29
|
+
|
|
30
|
+
**一个能从数据中自主发现物理定律的 AI 引擎。**
|
|
31
|
+
|
|
32
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
33
|
+
|
|
34
|
+
SymbolicMind 是一个因果符号回归引擎,基于 **BIC 精拟合** 和 **P0 可证伪性边界** 构建。它不仅拟合数据,还能自我修正、自我发现、自我进化——让机器从原始观测数据中提炼出人类可读的数学控制方程。
|
|
35
|
+
|
|
36
|
+
## 核心能力
|
|
37
|
+
|
|
38
|
+
- **方程发现**:自动从时间序列或静态数据中识别非线性动力学方程。
|
|
39
|
+
- **可证伪性**:内置 P0 探测器,通过残差自相关判别模型的可靠性,绝不捏造物理。
|
|
40
|
+
- **自演化算子库**:在需要时自主生成时变、分数阶、嵌套积分核等高级数学结构。
|
|
41
|
+
- **恒等式识别**:自动识别 `dx/dt = y` 等简单恒等式,避免过度建模。
|
|
42
|
+
- **AI 智能体就绪**:提供符合 OpenAI Function Calling 规范的工具接口,可被任何 AI Agent 调用。
|
|
43
|
+
- **记忆增强**:本地经验库记录成功发现,越用越聪明。
|
|
44
|
+
|
|
45
|
+
## 快速开始
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install numpy scipy scikit-learn
|
|
49
|
+
python quickstart.py
```
|
|
50
|
+
|
|
51
|
+
你将看到引擎发现简谐振动方程:d²x/dt² = -x,并输出详细的诊断报告。
|
|
52
|
+
|
|
53
|
+
## 基准测试成绩
|
|
54
|
+
SymbolicMind 在 Feynman、Strogatz、ODEBench 和 Blackbox 等基准上取得了领先成绩。详见 BENCHMARKS.md。
|
|
55
|
+
|
|
56
|
+
## 项目结构
|
|
57
|
+
```text
SymbolicMind/
├── symbolimind/           # 核心引擎包
│   ├── engine.py          # BIC 精拟合器 + P0 探测器
│   ├── extensions.py      # 恒等式检测 + 自适应断层扫描
│   ├── skill.py           # AI 智能体工具接口
│   ├── brain.py           # 自然语言大脑
│   └── memory.py          # 本地经验记忆库
├── tests/                 # 测试
├── BENCHMARKS.md          # 基准测试成绩单
├── CONTRIBUTING.md        # 贡献指南
├── LICENSE                # MIT 许可证
├── quickstart.py          # 5 分钟快速体验
└── README.md              # 本文件
```
|
|
71
|
+
## 引用
|
|
72
|
+
如果 SymbolicMind 对你的研究有帮助,请引用本项目。
|
|
73
|
+
|
|
74
|
+
## 贡献
|
|
75
|
+
我们欢迎社区贡献!无论是新的物理算子、基准测试还是文档改进,请参见 CONTRIBUTING.md。
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# SymbolicMind
|
|
2
|
+
|
|
3
|
+
**一个能从数据中自主发现物理定律的 AI 引擎。**
|
|
4
|
+
|
|
5
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
6
|
+
|
|
7
|
+
SymbolicMind 是一个因果符号回归引擎,基于 **BIC 精拟合** 和 **P0 可证伪性边界** 构建。它不仅拟合数据,还能自我修正、自我发现、自我进化——让机器从原始观测数据中提炼出人类可读的数学控制方程。
|
|
8
|
+
|
|
9
|
+
## 核心能力
|
|
10
|
+
|
|
11
|
+
- **方程发现**:自动从时间序列或静态数据中识别非线性动力学方程。
|
|
12
|
+
- **可证伪性**:内置 P0 探测器,通过残差自相关判别模型的可靠性,绝不捏造物理。
|
|
13
|
+
- **自演化算子库**:在需要时自主生成时变、分数阶、嵌套积分核等高级数学结构。
|
|
14
|
+
- **恒等式识别**:自动识别 `dx/dt = y` 等简单恒等式,避免过度建模。
|
|
15
|
+
- **AI 智能体就绪**:提供符合 OpenAI Function Calling 规范的工具接口,可被任何 AI Agent 调用。
|
|
16
|
+
- **记忆增强**:本地经验库记录成功发现,越用越聪明。
|
|
17
|
+
|
|
18
|
+
## 快速开始
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install numpy scipy scikit-learn
|
|
22
|
+
python quickstart.py
```
|
|
23
|
+
|
|
24
|
+
你将看到引擎发现简谐振动方程:d²x/dt² = -x,并输出详细的诊断报告。
|
|
25
|
+
|
|
26
|
+
## 基准测试成绩
|
|
27
|
+
SymbolicMind 在 Feynman、Strogatz、ODEBench 和 Blackbox 等基准上取得了领先成绩。详见 BENCHMARKS.md。
|
|
28
|
+
|
|
29
|
+
## 项目结构
|
|
30
|
+
```text
SymbolicMind/
├── symbolimind/           # 核心引擎包
│   ├── engine.py          # BIC 精拟合器 + P0 探测器
│   ├── extensions.py      # 恒等式检测 + 自适应断层扫描
│   ├── skill.py           # AI 智能体工具接口
│   ├── brain.py           # 自然语言大脑
│   └── memory.py          # 本地经验记忆库
├── tests/                 # 测试
├── BENCHMARKS.md          # 基准测试成绩单
├── CONTRIBUTING.md        # 贡献指南
├── LICENSE                # MIT 许可证
├── quickstart.py          # 5 分钟快速体验
└── README.md              # 本文件
```
|
|
44
|
+
## 引用
|
|
45
|
+
如果 SymbolicMind 对你的研究有帮助,请引用本项目。
|
|
46
|
+
|
|
47
|
+
## 贡献
|
|
48
|
+
我们欢迎社区贡献!无论是新的物理算子、基准测试还是文档改进,请参见 CONTRIBUTING.md。
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=64", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "symbolimind"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "AI-powered symbolic regression engine that discovers governing equations from data."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
authors = [{name = "SymbolicMind Team"}]
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Development Status :: 3 - Alpha",
|
|
14
|
+
"Intended Audience :: Science/Research",
|
|
15
|
+
"License :: OSI Approved :: MIT License",
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Programming Language :: Python :: 3.9",
|
|
18
|
+
"Programming Language :: Python :: 3.10",
|
|
19
|
+
"Programming Language :: Python :: 3.11",
|
|
20
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
21
|
+
"Topic :: Scientific/Engineering :: Physics",
|
|
22
|
+
]
|
|
23
|
+
keywords = ["symbolic-regression", "physics", "ai", "machine-learning", "science"]
|
|
24
|
+
requires-python = ">=3.9"
|
|
25
|
+
dependencies = ["numpy", "scipy", "scikit-learn"]
|
|
26
|
+
|
|
27
|
+
[project.urls]
|
|
28
|
+
Homepage = "https://github.com/d87skg/SymbolicMind"
|
|
29
|
+
Repository = "https://github.com/d87skg/SymbolicMind.git"
|
|
30
|
+
Issues = "https://github.com/d87skg/SymbolicMind/issues"
|
|
31
|
+
|
|
32
|
+
[tool.setuptools.packages.find]
|
|
33
|
+
where = ["."]
|
|
34
|
+
include = ["symbolimind*"]
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SymbolicMind Brain - Intelligent Agent Frontend
|
|
3
|
+
|
|
4
|
+
Contains LocalBrain and GPTBrain for natural language interaction
|
|
5
|
+
with the CDE engine. Includes physical knowledge base and equation beautification.
|
|
6
|
+
|
|
7
|
+
Author: SymbolicMind Team
|
|
8
|
+
License: MIT
|
|
9
|
+
"""
|
|
10
|
+
import os, sys, json
|
|
11
|
+
import numpy as np
|
|
12
|
+
|
|
13
|
+
class LocalBrain:
    """Offline, rule-based brain.

    Picks the target column named by a natural-language question and turns
    an engine result dict into a short Chinese report with a prettified
    equation. Uses only string matching; no network access is involved.
    """

    # R² at or above which the report may describe the goodness of fit as high.
    _HIGH_R2 = 0.9

    def __init__(self):
        # Built-in physics vocabulary: Chinese term -> column name
        # (acceleration, velocity, position, displacement,
        #  second derivative, first derivative).
        self.physical_terms = {
            "加速度": "d2x_dt2",
            "速度": "dx_dt",
            "位置": "x",
            "位移": "x",
            "二阶导数": "d2x_dt2",
            "一阶导数": "dx_dt"
        }

    def _mentions_term(self, question, term):
        """Return True if *term* occurs in *question* outside any longer known term.

        Some vocabulary entries are substrings of others ("速度" is part of
        "加速度"); the longer entries are blanked out first so a question about
        acceleration is not mistaken for one about velocity.
        """
        masked = question
        for other in self.physical_terms:
            if other != term and term in other:
                masked = masked.replace(other, " ")
        return term in masked

    def parse_question(self, question, available_columns):
        """Extract the target column name from a natural-language question.

        Resolution order:
          1. a physics term from the built-in vocabulary whose mapped column
             is present in ``available_columns``;
          2. a column name that literally appears in the question
             (case-insensitive, longest column name first);
          3. the last entry of ``available_columns``.

        Args:
            question: The user's question text.
            available_columns: Sequence of column-name strings.

        Returns:
            One element of ``available_columns``.
        """
        # 1. Physics vocabulary takes priority.
        for term, col in self.physical_terms.items():
            if col in available_columns and self._mentions_term(question, term):
                return col

        # 2. Literal column names. Both sides are lower-cased so that a column
        #    such as "Velocity" can match; longest names are tried first so a
        #    short name does not shadow a longer one that contains it.
        q_lower = question.lower()
        for col in sorted(available_columns, key=len, reverse=True):
            if col.lower() in q_lower:
                return col

        # 3. Fall back to the last column.
        return available_columns[-1]

    def generate_answer(self, result, target_column=None):
        """Build the textual report for an engine result.

        Args:
            result: Dict with either an ``'error'`` key, or the keys
                ``'equation'``, ``'r2'``, ``'p0_verdict'`` and optionally
                ``'residual_autocorr'`` (defaults to 0.0).
            target_column: Optional column name passed through to the
                equation formatter.

        Returns:
            The report string (Chinese).
        """
        if "error" in result:
            return f"抱歉,分析遇到问题:{result['error']}"

        equation = result['equation']
        r2 = result['r2']
        p0 = result['p0_verdict']
        lac = result.get('residual_autocorr', 0.0)

        # Rewrite the equation into a more physical-looking form.
        phys_eq = self._format_equation_physically(equation, target_column)

        # Interpret the P0 verdict. The "fit is very good" wording is only
        # used when R² actually is high; otherwise the report says the fit
        # is weak instead of contradicting the number printed next to it.
        verdict = p0.lower()
        if "adequate" in verdict or "exact" in verdict:
            reliability = "模型通过了严格的P0可证伪性检验,方程结构可靠。"
        elif r2 >= self._HIGH_R2:
            reliability = f"模型拟合优度很高,但P0检测发现残差仍有自相关({lac:.3f}),可能存在未捕获的隐藏变量,建议进一步实验。"
        else:
            reliability = f"模型拟合优度偏低,且P0检测发现残差仍有自相关({lac:.3f}),可能存在未捕获的隐藏变量,建议进一步实验。"

        return (f"我发现了控制方程:\n {phys_eq}\n"
                f"拟合优度 R² = {r2:.4f}。{reliability}")

    def _format_equation_physically(self, equation, target_column=None):
        """Prettify an equation string and optionally rewrite its left-hand side.

        Feature names of the form ``sqrt_<v>``, ``exp_neg_<v>_sq``,
        ``sin_<v>`` and ``cos_<v>`` are rendered as ``√(<v>)``, ``exp(-<v>²)``,
        ``sin(<v>)`` and ``cos(<v>)`` with balanced parentheses. When the
        equation carries the ``(identity)`` marker and ``target_column`` is
        given, the left-hand side is replaced by the readable form of that
        column.

        Args:
            equation: Equation string, e.g. ``"y = 1.0*sin_t"``.
            target_column: Optional column name for the left-hand side.

        Returns:
            The prettified equation string without the identity marker.
        """
        import re  # local import: only this method needs regular expressions

        # 1. Symbol beautification with balanced parentheses.
        equation = re.sub(r'\bsqrt_(\w+)', r'√(\1)', equation)
        equation = re.sub(r'\bexp_neg_(\w+?)_sq\b', r'exp(-\1²)', equation)
        equation = re.sub(r'\b(sin|cos)_(\w+)', r'\1(\2)', equation)
        # Plain substitutions kept for any occurrence the patterns above did
        # not match, so previously handled inputs are still rewritten.
        equation = equation.replace('sin_t', 'sin(t)').replace('cos_t', 'cos(t)')
        equation = equation.replace('exp_neg_', 'exp(-').replace('_sq', '²')
        equation = equation.replace('sqrt_', '√(')

        # 2. Detect and strip the identity marker (with or without a
        #    leading space).
        is_identity = "(identity)" in equation
        clean_eq = re.sub(r'\s*\(identity\)', '', equation).strip()

        # 3. For identities with a known target, put the readable target
        #    on the left-hand side.
        if target_column and is_identity:
            lhs = self._col_to_readable(target_column)
            parts = clean_eq.split('=')
            if len(parts) == 2:
                return f"{lhs} = {parts[1].strip()}"
            if len(parts) == 1:
                # No '=' present: the whole string is the right-hand side.
                return f"{lhs} = {clean_eq}"

        # Default: the beautified equation as is.
        return clean_eq

    def _col_to_readable(self, col):
        """Map a column name to a readable symbol; unknown names pass through unchanged."""
        mapping = {
            "d2x_dt2": "d²x/dt²",
            "dx_dt": "dx/dt",
            "x": "x",
            "t": "t"
        }
        return mapping.get(col, col)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# ---------- 云端智能大脑 (需 openai 包及环境变量) ----------
|
|
103
|
+
class GPTBrain:
    """Cloud-backed brain that delegates to an OpenAI chat model.

    Requires the ``openai`` package and the ``OPENAI_API_KEY`` environment
    variable. Works with both the client-based API of ``openai>=1.0`` and
    the legacy ``openai.ChatCompletion`` interface.
    """

    def __init__(self, model="gpt-3.5-turbo"):
        """Load the ``openai`` package and read the API key.

        Args:
            model: Chat model name sent with every request.

        Raises:
            ImportError: If the ``openai`` package is not installed.
            ValueError: If ``OPENAI_API_KEY`` is not set.
        """
        self.model = model
        # Imported here so the module can be loaded without the optional
        # dependency installed.
        import openai
        self.openai = openai
        self.api_key = os.environ.get("OPENAI_API_KEY")
        if not self.api_key:
            raise ValueError("请设置环境变量 OPENAI_API_KEY")
        self.openai.api_key = self.api_key
        # openai>=1.0 replaced ChatCompletion with a client object; build
        # one when the installed package provides it.
        client_cls = getattr(openai, "OpenAI", None)
        self._client = client_cls(api_key=self.api_key) if client_cls is not None else None

    def _chat(self, prompt, temperature, max_tokens):
        """Send a single-turn user prompt and return the stripped reply text."""
        request = {
            "model": self.model,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
        client = getattr(self, "_client", None)
        if client is not None:
            response = client.chat.completions.create(**request)
        else:
            # Legacy (<1.0) interface.
            response = self.openai.ChatCompletion.create(**request)
        content = response.choices[0].message.content
        # The content may be None; treat that as an empty reply.
        return (content or "").strip()

    def parse_question(self, question, available_columns):
        """Ask the model which column the question refers to.

        Falls back to ``LocalBrain.parse_question`` when the reply is not
        one of ``available_columns``.

        Args:
            question: The user's question text.
            available_columns: Sequence of column-name strings.

        Returns:
            One element of ``available_columns``.
        """
        prompt = (
            f"你是一个科学数据分析助手。用户有一份数据,包含以下列:{', '.join(available_columns)}。\n"
            f"用户的问题是:“{question}”\n"
            f"请从列名中选出与问题最相关的一个列名,只输出列名本身,不要输出任何其他内容。"
        )
        answer = self._chat(prompt, temperature=0.0, max_tokens=20)
        if answer in available_columns:
            return answer
        # The model sometimes wraps the name in quotes or backticks.
        unquoted = answer.strip("`'\"“”‘’ ")
        if unquoted in available_columns:
            return unquoted
        fallback = LocalBrain()
        return fallback.parse_question(question, available_columns)

    def generate_answer(self, result, target_column=None):
        """Ask the model to write a Chinese report for an engine result.

        Args:
            result: Dict with either an ``'error'`` key, or the keys
                ``'equation'``, ``'r2'``, ``'p0_verdict'`` and optionally
                ``'residual_autocorr'`` (defaults to 0.0).
            target_column: Accepted for interface parity with
                ``LocalBrain``; not used here.

        Returns:
            The report text returned by the model, or an error message when
            ``result`` contains ``'error'``.
        """
        if "error" in result:
            return f"抱歉,分析遇到问题:{result['error']}"
        equation = result['equation']
        r2 = result['r2']
        p0 = result['p0_verdict']
        lac = result.get('residual_autocorr', 0.0)
        prompt = (
            f"你是一个物理学数据科学家。CDE引擎从数据中发现了一个方程:{equation},"
            f"R² = {r2:.4f},残差自相关 = {lac:.4f},P0可证伪性判决:“{p0}”。\n"
            f"请用中文向用户报告这个发现,解释方程的意义,并根据P0判决给出建议。"
        )
        return self._chat(prompt, temperature=0.5, max_tokens=300)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
# ---------- 工厂函数 ----------
|
|
154
|
+
def create_brain(mode="local"):
    """Factory returning a brain instance for the requested mode.

    Args:
        mode: ``"local"`` for ``LocalBrain`` or ``"gpt"`` for ``GPTBrain``.

    Returns:
        A ``LocalBrain`` or ``GPTBrain``. In ``"gpt"`` mode a ``LocalBrain``
        is returned instead when the GPT brain cannot be constructed
        (missing ``openai`` package or missing API key).

    Raises:
        ValueError: If ``mode`` is neither ``"local"`` nor ``"gpt"``.
    """
    if mode == "local":
        return LocalBrain()
    if mode == "gpt":
        try:
            return GPTBrain()
        except (ValueError, ImportError) as e:
            # ValueError: API key not set; ImportError: openai not installed.
            # Either way, degrade to the offline brain instead of crashing.
            print(f"无法加载GPT大脑: {e},降级为本地大脑。")
            return LocalBrain()
    raise ValueError("模式仅支持 'local' 或 'gpt'")
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SymbolicMind Engine - Core Symbolic Regression Module
|
|
3
|
+
|
|
4
|
+
This module provides the CDE_V80 class, the core engine for discovering
|
|
5
|
+
governing equations from data. It features BIC-guided sparse regression,
|
|
6
|
+
P0 falsifiability boundary, and dual-mode operation (static / time-series).
|
|
7
|
+
|
|
8
|
+
Author: SymbolicMind Team
|
|
9
|
+
License: MIT
|
|
10
|
+
"""
|
|
11
|
+
import numpy as np
|
|
12
|
+
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
|
|
13
|
+
|
|
14
|
+
class CDE_V80:
    """CDE V8.0 symbolic-regression adapter with a fit/predict interface.

    Two feature-library modes:
      - ``time_series=False`` (default): static tabular data; degree-2
        polynomial terms plus a fixed set of transcendental transforms of
        each input column.
      - ``time_series=True``: rows are treated as consecutive samples with a
        sampling interval of 1; the library adds lagged copies and
        exponential memory kernels of each input column.

    Candidate terms are pruned by greedy backward elimination on the BIC.
    The model has no intercept term.

    Attributes set by ``fit``: ``coefficients_``, ``selected_features_``,
    ``r2_``, ``residual_autocorr_``, ``equation_``.
    """

    # Lags (in samples) added in time-series mode.
    _LAGS = (1, 2, 5, 10)
    # Decay rates of the constant-rate exponential memory kernels.
    _KERNEL_DECAYS = (0.05, 0.1, 0.2, 0.5)
    # Modulation amplitudes of the time-varying decay rate.
    _TV_AMPLITUDES = (0.1, 0.2)

    def __init__(self, threshold=0.2, max_iter=8, time_series=False, autostop=True):
        # threshold, max_iter and autostop are stored but not read by any
        # method of this class.
        self.threshold = threshold
        self.max_iter = max_iter
        self.time_series = time_series
        self.autostop = autostop
        self.coefficients_ = None
        self.selected_features_ = None
        self.r2_ = None
        self.residual_autocorr_ = None
        self.equation_ = None

    def fit(self, X, y, feature_names=None):
        """Build the feature library, select terms by BIC and store the result.

        Args:
            X: Array-like of shape (n_samples, n_features).
            y: Array-like with n_samples target values; flattened to 1-D so
                a column vector is accepted.
            feature_names: Optional names for the columns of ``X``;
                defaults to ``x0, x1, ...``.

        Returns:
            self

        Raises:
            ValueError: If ``X`` and ``y`` disagree on the number of samples.
        """
        X = np.atleast_2d(np.asarray(X, dtype=float))
        # A (n, 1) target would otherwise broadcast against the (n,)
        # prediction into an (n, n) residual matrix.
        y = np.asarray(y, dtype=float).ravel()
        if y.shape[0] != X.shape[0]:
            raise ValueError(
                f"X has {X.shape[0]} rows but y has {y.shape[0]} values"
            )
        if feature_names is None:
            feature_names = [f'x{i}' for i in range(X.shape[1])]

        # Feature engineering.
        X_aug, aug_names = self._build_features(X, feature_names)

        # Core BIC-guided sparse fit.
        final_coeffs, active_idx, r2, lag1 = self._bic_fit(X_aug, y, aug_names)

        self.coefficients_ = final_coeffs
        self.selected_features_ = [aug_names[i] for i in active_idx]
        self.r2_ = r2
        self.residual_autocorr_ = lag1
        self.equation_ = self._format_equation(final_coeffs, aug_names, active_idx)

        return self

    def predict(self, X):
        """Evaluate the fitted model on ``X``.

        The feature library is rebuilt from ``X`` in the same mode that was
        used for fitting.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.coefficients_ is None:
            raise RuntimeError("Model not fitted. Call fit() before predict().")
        X = np.atleast_2d(np.asarray(X, dtype=float))
        # Names are irrelevant for prediction; only the column order matters.
        X_aug, _ = self._build_features(X, [f'x{i}' for i in range(X.shape[1])])
        return X_aug @ self.coefficients_

    def get_equation(self):
        """Return the fitted equation string, or a notice if not fitted."""
        if self.equation_ is None:
            return "Model not fitted."
        return self.equation_

    def get_p0_report(self):
        """Return the P0 verdict derived from R² and the lag-1 residual autocorrelation."""
        if self.residual_autocorr_ is None or self.r2_ is None:
            return "P0 not available."
        autocorr = abs(self.residual_autocorr_)
        if self.r2_ < 0.3 and autocorr > 0.3:
            return "Hidden variables detected (low R^2 + high residual autocorrelation)"
        if self.r2_ >= 0.95 and autocorr < 0.1:
            return "Model is structurally adequate."
        return "Further inspection recommended."

    def _build_features(self, X, base_names):
        """Dispatch to the feature builder matching the configured mode."""
        if self.time_series:
            return self._build_time_series_features(X, base_names)
        return self._build_static_features(X, base_names)

    def _build_static_features(self, X, base_names):
        """Static mode: degree-2 polynomial terms plus transcendental transforms.

        Returns:
            (features, names): the stacked feature matrix and the matching
            list of feature names.
        """
        poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
        X_poly = poly.fit_transform(X)

        blocks = [X_poly]
        names = list(poly.get_feature_names_out(base_names))

        # Name template and transform, applied per input column in this
        # order. abs() / 1+|x| keep sqrt, log and the reciprocal defined for
        # every real input.
        transforms = (
            ('sqrt_{}', lambda v: np.sqrt(np.abs(v))),
            ('exp_neg_{}_sq', lambda v: np.exp(-v**2)),
            ('sin_{}', np.sin),
            ('cos_{}', np.cos),
            ('log1p_{}', lambda v: np.log1p(np.abs(v))),
            ('inv1p_{}', lambda v: 1.0 / (1.0 + np.abs(v))),
        )
        for i in range(X.shape[1]):
            xi = X[:, i]
            for template, func in transforms:
                blocks.append(func(xi).reshape(-1, 1))
                names.append(template.format(base_names[i]))

        return np.hstack(blocks), names

    def _build_time_series_features(self, X, base_names):
        """Time-series mode: base columns, lagged copies and memory kernels.

        Returns:
            (features, names): the stacked feature matrix and the matching
            list of feature names.
        """
        n_samples, n_features = X.shape
        dt = 1.0  # uniform sampling interval of 1 is assumed

        blocks = [X]
        names = list(base_names)

        # Lagged copies; the first `lag` entries are padded with the first
        # sample instead of wrapping around.
        for lag in self._LAGS:
            if lag >= n_samples:
                continue
            for j in range(n_features):
                shifted = np.roll(X[:, j], lag)
                shifted[:lag] = X[0, j]
                blocks.append(shifted.reshape(-1, 1))
                names.append(f'{base_names[j]}_lag{lag}')

        # Exponential memory kernels with constant decay, computed with the
        # recursion m[i] = exp(-alpha*dt) * m[i-1] + x[i]*dt.
        for j in range(n_features):
            for alpha in self._KERNEL_DECAYS:
                decay = np.exp(-alpha * dt)
                mem = np.zeros(n_samples)
                running = 0.0
                for i in range(n_samples):
                    if i == 0:
                        running = X[0, j] * dt
                    else:
                        running = decay * running + X[i, j] * dt
                    mem[i] = running
                blocks.append(mem.reshape(-1, 1))
                names.append(f'int_{base_names[j]}_a{alpha}')

        # Time-varying kernels: the decay rate is modulated sinusoidally
        # (period 25 samples). The rate at step i applies to the whole
        # history, so no recursion exists; the sum over past samples k < i
        # is evaluated as one vectorised dot product per step.
        t = np.arange(n_samples) * dt
        steps = np.arange(n_samples)
        for j in range(n_features):
            for amp in self._TV_AMPLITUDES:
                alpha_t = 0.1 + amp * np.sin(2 * np.pi * t / 25.0)
                mem = np.zeros(n_samples)
                for i in range(1, n_samples):
                    tau = (i - steps[:i]) * dt
                    mem[i] = np.dot(np.exp(-alpha_t[i] * tau), X[:i, j]) * dt
                blocks.append(mem.reshape(-1, 1))
                names.append(f'tv_int_{base_names[j]}_amp{amp}')

        return np.hstack(blocks), names

    def _bic_fit(self, Theta, target, feature_names):
        """Least-squares fit with greedy backward elimination on the BIC.

        Args:
            Theta: Feature matrix of shape (n_samples, n_features).
            target: Target values; flattened to 1-D.
            feature_names: Names of the columns of ``Theta`` (not used in
                the computation).

        Returns:
            (coeffs, active, r2, lag1): coefficients on the original feature
            scale (zero for eliminated columns), indices of the retained
            columns, the coefficient of determination, and the lag-1
            autocorrelation of the residual.
        """
        target = np.asarray(target, dtype=float).ravel()

        # Scale columns to unit standard deviation; constant columns are
        # left unscaled to avoid division by zero.
        Theta_std = np.std(Theta, axis=0)
        Theta_std[Theta_std == 0] = 1.0
        Theta_scaled = Theta / Theta_std
        n_samples, n_features = Theta.shape
        active = list(range(n_features))

        def subset_bic(cols):
            """BIC of the least-squares fit restricted to the given columns."""
            sub = Theta_scaled[:, cols]
            coef = np.linalg.lstsq(sub, target, rcond=1e-6)[0]
            sse = np.sum((target - sub @ coef) ** 2)
            if sse < 1e-15:
                sse = 1e-15  # keep log() finite for exact fits
            return n_samples * np.log(sse / n_samples) + len(cols) * np.log(n_samples)

        best_bic = subset_bic(active)

        # Backward elimination: drop the column whose removal lowers the
        # BIC the most; stop when no removal improves it.
        while len(active) > 1:
            min_bic, drop_col = min(
                ((subset_bic([c for c in active if c != col]), col) for col in active),
                key=lambda pair: pair[0],
            )
            if min_bic >= best_bic:
                break
            active.remove(drop_col)
            best_bic = min_bic

        # Refit on the survivors and map coefficients back to the original scale.
        final_coeffs = np.zeros(n_features)
        if len(active) > 0:
            coeff_active = np.linalg.lstsq(Theta_scaled[:, active], target, rcond=1e-6)[0]
            final_coeffs[active] = coeff_active / Theta_std[active]

        # Goodness of fit.
        residual = target - Theta @ final_coeffs
        ss_res = np.sum(residual**2)
        ss_tot = np.sum((target - np.mean(target))**2)
        r2 = 1 - ss_res/ss_tot if ss_tot > 1e-12 else 1.0

        # Lag-1 autocorrelation of the mean-removed residual, computed
        # directly as sum(r[i]*r[i+1]) / sum(r[i]^2). This is well defined
        # for any sample count, including inputs with only two samples.
        centered = residual - np.mean(residual)
        energy = np.dot(centered, centered)
        lag1 = np.dot(centered[1:], centered[:-1]) / energy if energy != 0 else 0.0

        return final_coeffs, active, r2, lag1

    @staticmethod
    def _format_coefficient(c):
        """Render a coefficient rounded to 6 decimals.

        Values that would round to zero are shown with 6 significant digits
        instead, so a retained term is never printed as ``0.0*name``.
        """
        value = float(c)
        rounded = round(value, 6)
        return str(rounded) if rounded != 0 else f"{value:.6g}"

    def _format_equation(self, coeffs, names, active_idx):
        """Format the retained terms as an equation string ``y = ...``."""
        if len(active_idx) == 0:
            return "y = 0 (all terms eliminated)"
        terms = [
            f"{self._format_coefficient(coeffs[i])}*{names[i]}"
            for i in active_idx
            if abs(coeffs[i]) > 1e-10
        ]
        if not terms:
            return "y = constant"
        return "y = " + " + ".join(terms).replace("+ -", "- ")


# Backward-compatible alias for the previous class name.
CDE_V6_6 = CDE_V80
|