routed-confidence 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- routed_confidence-0.1.0/PKG-INFO +195 -0
- routed_confidence-0.1.0/README.md +1 -0
- routed_confidence-0.1.0/pyproject.toml +25 -0
- routed_confidence-0.1.0/routed_confidence/README.md +181 -0
- routed_confidence-0.1.0/routed_confidence/__init__.py +5 -0
- routed_confidence-0.1.0/routed_confidence/config.py +65 -0
- routed_confidence-0.1.0/routed_confidence/config.yaml +94 -0
- routed_confidence-0.1.0/routed_confidence/evaluator.py +219 -0
- routed_confidence-0.1.0/routed_confidence/internal/__init__.py +1 -0
- routed_confidence-0.1.0/routed_confidence/internal/confidence_evaluators/__init__.py +44 -0
- routed_confidence-0.1.0/routed_confidence/internal/confidence_evaluators/base.py +33 -0
- routed_confidence-0.1.0/routed_confidence/internal/confidence_evaluators/field_validators/__init__.py +15 -0
- routed_confidence-0.1.0/routed_confidence/internal/confidence_evaluators/field_validators/base_validator.py +216 -0
- routed_confidence-0.1.0/routed_confidence/internal/confidence_evaluators/field_validators/code_injector.py +407 -0
- routed_confidence-0.1.0/routed_confidence/internal/confidence_evaluators/field_validators/common_validator.py +138 -0
- routed_confidence-0.1.0/routed_confidence/internal/confidence_evaluators/field_validators/meeting_validator.py +493 -0
- routed_confidence-0.1.0/routed_confidence/internal/confidence_evaluators/field_validators/register_validator.py +879 -0
- routed_confidence-0.1.0/routed_confidence/internal/confidence_evaluators/field_validators/stock_validator.py +127 -0
- routed_confidence-0.1.0/routed_confidence/internal/confidence_evaluators/field_validators/vote_validator.py +212 -0
- routed_confidence-0.1.0/routed_confidence/internal/confidence_evaluators/improved_code_manager.py +124 -0
- routed_confidence-0.1.0/routed_confidence/internal/confidence_evaluators/rule_based_evaluator.py +229 -0
- routed_confidence-0.1.0/routed_confidence/internal/confidence_evaluators/similarity_score.py +303 -0
- routed_confidence-0.1.0/routed_confidence/internal/confidence_evaluators/wilson_evaluator.py +443 -0
- routed_confidence-0.1.0/routed_confidence/internal/pdf_handler.py +21 -0
- routed_confidence-0.1.0/routed_confidence/internal/retrieve_text.py +342 -0
- routed_confidence-0.1.0/routed_confidence/internal/similarity.py +18 -0
- routed_confidence-0.1.0/routed_confidence/rules/meeting_ruler.json +354 -0
- routed_confidence-0.1.0/routed_confidence.egg-info/PKG-INFO +195 -0
- routed_confidence-0.1.0/routed_confidence.egg-info/SOURCES.txt +31 -0
- routed_confidence-0.1.0/routed_confidence.egg-info/dependency_links.txt +1 -0
- routed_confidence-0.1.0/routed_confidence.egg-info/requires.txt +7 -0
- routed_confidence-0.1.0/routed_confidence.egg-info/top_level.txt +1 -0
- routed_confidence-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: routed-confidence
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Field-level confidence evaluator SDK for routed extraction results
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: dotenv>=0.9.9
|
|
8
|
+
Requires-Dist: fastapi>=0.116.1
|
|
9
|
+
Requires-Dist: httpx[socks]>=0.28.1
|
|
10
|
+
Requires-Dist: openai>=1.107.1
|
|
11
|
+
Requires-Dist: pydantic>=2.11.7
|
|
12
|
+
Requires-Dist: pyyaml>=6.0.2
|
|
13
|
+
Requires-Dist: uvicorn>=0.35.0
|
|
14
|
+
|
|
15
|
+
# routed-confidence
|
|
16
|
+
|
|
17
|
+
字段级置信度评估 SDK。
|
|
18
|
+
|
|
19
|
+
这个包只负责一件事:输入一条提取结果,输出每个字段的置信度分数。它不计算检出率、误伤率、日报统计,也不包含 Web、API 服务或自动迭代逻辑。
|
|
20
|
+
|
|
21
|
+
## 位置
|
|
22
|
+
|
|
23
|
+
源码目录:
|
|
24
|
+
|
|
25
|
+
```text
|
|
26
|
+
routed_confidence/
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
主要入口:
|
|
30
|
+
|
|
31
|
+
```text
|
|
32
|
+
routed_confidence/evaluator.py
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
内部评分组件:
|
|
36
|
+
|
|
37
|
+
```text
|
|
38
|
+
routed_confidence/internal/confidence_evaluators/
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
默认规则文件:
|
|
42
|
+
|
|
43
|
+
```text
|
|
44
|
+
routed_confidence/rules/meeting_ruler.json
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## 安装
|
|
48
|
+
|
|
49
|
+
在当前仓库本地开发安装:
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install -e .
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
从本地 wheel 安装:
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
python3 -m pip wheel . --no-deps --no-build-isolation -w dist
|
|
59
|
+
pip install dist/routed_confidence-0.1.0-py3-none-any.whl
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
如果发布到 PyPI 或私有 PyPI 后:
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
pip install routed-confidence
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Python 导入名使用下划线:
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
from routed_confidence import ConfidenceEvaluator
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## 使用
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from routed_confidence import ConfidenceEvaluator
|
|
78
|
+
|
|
79
|
+
record = {
|
|
80
|
+
"extraction_result": {
|
|
81
|
+
"会议召开时间": {
|
|
82
|
+
"value": "2026-05-20 14:00:00",
|
|
83
|
+
"content": "会议召开时间为2026年5月20日14:00",
|
|
84
|
+
"type": "text",
|
|
85
|
+
"title_path": ["一、会议基本情况"],
|
|
86
|
+
}
|
|
87
|
+
},
|
|
88
|
+
"input_text": "会议召开时间为2026年5月20日14:00。",
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
evaluator = ConfidenceEvaluator()
|
|
92
|
+
result = evaluator.evaluate(record)
|
|
93
|
+
print(result)
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
也可以使用一次性函数:
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
from routed_confidence import evaluate_confidence
|
|
100
|
+
|
|
101
|
+
result = evaluate_confidence(record)
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## 返回格式
|
|
105
|
+
|
|
106
|
+
```json
|
|
107
|
+
{
|
|
108
|
+
"field_scores": {
|
|
109
|
+
"会议召开时间": {
|
|
110
|
+
"source_value": "2026-05-20 14:00:00",
|
|
111
|
+
"total_score": 0.75,
|
|
112
|
+
"dimension_scores": {
|
|
113
|
+
"schema": 1.0,
|
|
114
|
+
"similarity": 0.82,
|
|
115
|
+
"historical": 0.5,
|
|
116
|
+
"relation": 1.0
|
|
117
|
+
},
|
|
118
|
+
"reason": "字段符合规则要求",
|
|
119
|
+
"violations": [],
|
|
120
|
+
"suggestions": []
|
|
121
|
+
}
|
|
122
|
+
},
|
|
123
|
+
"summary": {
|
|
124
|
+
"total_fields": 1,
|
|
125
|
+
"average_score": 0.75
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
注意:SDK 不返回 `is_correct`。`is_correct` 属于“标注数据 vs 提取数据”的评测逻辑,不属于置信度 SDK。
|
|
131
|
+
|
|
132
|
+
## 自定义规则和权重
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
from routed_confidence import ConfidenceEvaluator
|
|
136
|
+
|
|
137
|
+
evaluator = ConfidenceEvaluator(
|
|
138
|
+
rules_file="/path/to/meeting_ruler.json",
|
|
139
|
+
weights={
|
|
140
|
+
"schema": 0.1,
|
|
141
|
+
"sim": 0.2,
|
|
142
|
+
"wilson": 0.1,
|
|
143
|
+
"relation": 0.6,
|
|
144
|
+
},
|
|
145
|
+
)
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
如果不传 `rules_file`,SDK 会优先寻找当前项目里的 `schema/ruler/meeting_ruler.json`,找不到时使用包内置的默认规则。
|
|
149
|
+
|
|
150
|
+
## 发布
|
|
151
|
+
|
|
152
|
+
安装构建工具:
|
|
153
|
+
|
|
154
|
+
```bash
|
|
155
|
+
pip install build twine
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
构建:
|
|
159
|
+
|
|
160
|
+
```bash
|
|
161
|
+
python -m build
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
会生成:
|
|
165
|
+
|
|
166
|
+
```text
|
|
167
|
+
dist/routed_confidence-0.1.0-py3-none-any.whl
|
|
168
|
+
dist/routed_confidence-0.1.0.tar.gz
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
上传到 PyPI:
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
twine upload dist/*
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
上传到私有 PyPI:
|
|
178
|
+
|
|
179
|
+
```bash
|
|
180
|
+
twine upload --repository-url https://your-private-pypi/ dist/*
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
发布后安装:
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
pip install routed-confidence
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
## 当前边界
|
|
190
|
+
|
|
191
|
+
- 只做单条/字段级置信度评估。
|
|
192
|
+
- 不计算检出率。
|
|
193
|
+
- 不计算误伤率。
|
|
194
|
+
- 不依赖 `methods/api`、`methods/web_conf`、`methods/reflect_rule`。
|
|
195
|
+
- 不要求用户导入内部四维组件。
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# RouteD - 数据提取置信度评估API
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "routed-confidence"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Field-level confidence evaluator SDK for routed extraction results"
|
|
5
|
+
readme = "routed_confidence/README.md"
|
|
6
|
+
requires-python = ">=3.10"
|
|
7
|
+
dependencies = [
|
|
8
|
+
"dotenv>=0.9.9",
|
|
9
|
+
"fastapi>=0.116.1",
|
|
10
|
+
"httpx[socks]>=0.28.1",
|
|
11
|
+
"openai>=1.107.1",
|
|
12
|
+
"pydantic>=2.11.7",
|
|
13
|
+
"pyyaml>=6.0.2",
|
|
14
|
+
"uvicorn>=0.35.0",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
[build-system]
|
|
18
|
+
requires = ["setuptools>=68", "wheel"]
|
|
19
|
+
build-backend = "setuptools.build_meta"
|
|
20
|
+
|
|
21
|
+
[tool.setuptools.packages.find]
|
|
22
|
+
include = ["routed_confidence*"]
|
|
23
|
+
|
|
24
|
+
[tool.setuptools.package-data]
|
|
25
|
+
routed_confidence = ["config.yaml", "rules/*.json"]
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
# routed-confidence
|
|
2
|
+
|
|
3
|
+
字段级置信度评估 SDK。
|
|
4
|
+
|
|
5
|
+
这个包只负责一件事:输入一条提取结果,输出每个字段的置信度分数。它不计算检出率、误伤率、日报统计,也不包含 Web、API 服务或自动迭代逻辑。
|
|
6
|
+
|
|
7
|
+
## 位置
|
|
8
|
+
|
|
9
|
+
源码目录:
|
|
10
|
+
|
|
11
|
+
```text
|
|
12
|
+
routed_confidence/
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
主要入口:
|
|
16
|
+
|
|
17
|
+
```text
|
|
18
|
+
routed_confidence/evaluator.py
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
内部评分组件:
|
|
22
|
+
|
|
23
|
+
```text
|
|
24
|
+
routed_confidence/internal/confidence_evaluators/
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
默认规则文件:
|
|
28
|
+
|
|
29
|
+
```text
|
|
30
|
+
routed_confidence/rules/meeting_ruler.json
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## 安装
|
|
34
|
+
|
|
35
|
+
在当前仓库本地开发安装:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install -e .
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
从本地 wheel 安装:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
python3 -m pip wheel . --no-deps --no-build-isolation -w dist
|
|
45
|
+
pip install dist/routed_confidence-0.1.0-py3-none-any.whl
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
如果发布到 PyPI 或私有 PyPI 后:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install routed-confidence
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Python 导入名使用下划线:
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from routed_confidence import ConfidenceEvaluator
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## 使用
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from routed_confidence import ConfidenceEvaluator
|
|
64
|
+
|
|
65
|
+
record = {
|
|
66
|
+
"extraction_result": {
|
|
67
|
+
"会议召开时间": {
|
|
68
|
+
"value": "2026-05-20 14:00:00",
|
|
69
|
+
"content": "会议召开时间为2026年5月20日14:00",
|
|
70
|
+
"type": "text",
|
|
71
|
+
"title_path": ["一、会议基本情况"],
|
|
72
|
+
}
|
|
73
|
+
},
|
|
74
|
+
"input_text": "会议召开时间为2026年5月20日14:00。",
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
evaluator = ConfidenceEvaluator()
|
|
78
|
+
result = evaluator.evaluate(record)
|
|
79
|
+
print(result)
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
也可以使用一次性函数:
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
from routed_confidence import evaluate_confidence
|
|
86
|
+
|
|
87
|
+
result = evaluate_confidence(record)
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## 返回格式
|
|
91
|
+
|
|
92
|
+
```json
|
|
93
|
+
{
|
|
94
|
+
"field_scores": {
|
|
95
|
+
"会议召开时间": {
|
|
96
|
+
"source_value": "2026-05-20 14:00:00",
|
|
97
|
+
"total_score": 0.75,
|
|
98
|
+
"dimension_scores": {
|
|
99
|
+
"schema": 1.0,
|
|
100
|
+
"similarity": 0.82,
|
|
101
|
+
"historical": 0.5,
|
|
102
|
+
"relation": 1.0
|
|
103
|
+
},
|
|
104
|
+
"reason": "字段符合规则要求",
|
|
105
|
+
"violations": [],
|
|
106
|
+
"suggestions": []
|
|
107
|
+
}
|
|
108
|
+
},
|
|
109
|
+
"summary": {
|
|
110
|
+
"total_fields": 1,
|
|
111
|
+
"average_score": 0.75
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
注意:SDK 不返回 `is_correct`。`is_correct` 属于“标注数据 vs 提取数据”的评测逻辑,不属于置信度 SDK。
|
|
117
|
+
|
|
118
|
+
## 自定义规则和权重
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
from routed_confidence import ConfidenceEvaluator
|
|
122
|
+
|
|
123
|
+
evaluator = ConfidenceEvaluator(
|
|
124
|
+
rules_file="/path/to/meeting_ruler.json",
|
|
125
|
+
weights={
|
|
126
|
+
"schema": 0.1,
|
|
127
|
+
"sim": 0.2,
|
|
128
|
+
"wilson": 0.1,
|
|
129
|
+
"relation": 0.6,
|
|
130
|
+
},
|
|
131
|
+
)
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
如果不传 `rules_file`,SDK 会优先寻找当前项目里的 `schema/ruler/meeting_ruler.json`,找不到时使用包内置的默认规则。
|
|
135
|
+
|
|
136
|
+
## 发布
|
|
137
|
+
|
|
138
|
+
安装构建工具:
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
pip install build twine
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
构建:
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
python -m build
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
会生成:
|
|
151
|
+
|
|
152
|
+
```text
|
|
153
|
+
dist/routed_confidence-0.1.0-py3-none-any.whl
|
|
154
|
+
dist/routed_confidence-0.1.0.tar.gz
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
上传到 PyPI:
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
twine upload dist/*
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
上传到私有 PyPI:
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
twine upload --repository-url https://your-private-pypi/ dist/*
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
发布后安装:
|
|
170
|
+
|
|
171
|
+
```bash
|
|
172
|
+
pip install routed-confidence
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
## 当前边界
|
|
176
|
+
|
|
177
|
+
- 只做单条/字段级置信度评估。
|
|
178
|
+
- 不计算检出率。
|
|
179
|
+
- 不计算误伤率。
|
|
180
|
+
- 不依赖 `methods/api`、`methods/web_conf`、`methods/reflect_rule`。
|
|
181
|
+
- 不要求用户导入内部四维组件。
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Package configuration accessors for routed_confidence."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from importlib import resources
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import yaml
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _load_config() -> dict[str, Any]:
    """Read the ``config.yaml`` bundled inside the ``routed_confidence`` package.

    Returns:
        The parsed YAML document, or an empty dict when the document parses
        to a falsy value (for example an empty file).
    """
    # Resolve the file through importlib.resources instead of a filesystem
    # path relative to this module.
    bundled_yaml = resources.files("routed_confidence").joinpath("config.yaml")
    yaml_text = bundled_yaml.read_text(encoding="utf-8")
    parsed = yaml.safe_load(yaml_text)
    return parsed if parsed else {}
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Parse the bundled config.yaml once, at import time. Every constant below is
# looked up in this mapping and falls back to the literal default given in the
# corresponding ``.get`` call when the key is absent.
_config = _load_config()

# Current working directory of the importing process, captured at import time.
PROJECT_ROOT = str(Path.cwd())

# --- "wilson" section: Wilson confidence-interval settings -------------------
wilson_config = _config.get("wilson", {})
# Mapping of confidence level -> z score.
WILSON_Z_SCORES = wilson_config.get("z_scores", {0.95: 1.96})
DEFAULT_CONFIDENCE_LEVEL = wilson_config.get("default_confidence_level", 0.95)
DEFAULT_NEUTRAL_CONFIDENCE = wilson_config.get("default_neutral_confidence", 0.5)

# --- "scoring" section: thresholds and fixed scores --------------------------
scoring_config = _config.get("scoring", {})
SCORE_THRESHOLD_HIGH = scoring_config.get("threshold_high", 0.8)
SCORE_THRESHOLD_MEDIUM = scoring_config.get("threshold_medium", 0.5)
SPECIAL_VALUE_CORRECT_SCORE = scoring_config.get("special_value_correct_score", 0.9)
SPECIAL_VALUE_WARNING_SCORE = scoring_config.get("special_value_warning_score", 0.7)
DATE_FORMAT_ERROR_SCORE = scoring_config.get("date_format_error_score", 0.3)
MAPPING_CORRECT_SCORE = scoring_config.get("mapping_correct_score", 0.9)

# --- "weights" section: field, context-type and dimension weights ------------
weights_config = _config.get("weights", {})
REQUIRED_FIELD_WEIGHT = weights_config.get("required_field_weight", 2.0)
OPTIONAL_FIELD_WEIGHT = weights_config.get("optional_field_weight", 1.0)
CONTEXT_TYPE_WEIGHTS = weights_config.get("context_type_weights", {})
DIMENSION_WEIGHTS = weights_config.get(
    "dimension_weights",
    {
        "schema": 0.1,
        "sim": 0.2,
        "wilson": 0.1,
        "relation": 0.6,
    },
)

# --- "date_formats" section: strftime-style patterns -------------------------
date_formats_config = _config.get("date_formats", {})
SUPPORTED_DATE_FORMATS = date_formats_config.get(
    "supported_formats",
    [
        "%Y-%m-%d %H:%M:%S",
        "%Y-%m-%d",
    ],
)

# --- "paths" section: file locations; empty string when not configured -------
paths_config = _config.get("paths", {})
DEFAULT_RULES_FILE_PATH = paths_config.get("default_rules_file", "")
WILSON_ENTROPY_STATISTIC_FILE_PATH = paths_config.get("wilson_entropy_statistic_file", "")
SIM_SCORE_FILE_PATH = paths_config.get("sim_score_file", "")
HISTORICAL_DATA_FILE_PATH = paths_config.get("historical_data_file", "")
STATISTIC_FILE_SUFFIX = paths_config.get("statistic_file_suffix", "_statistic.json")

# --- "score_reasons" section: reason text for each score band ----------------
score_reasons_config = _config.get("score_reasons", {})
SCORE_REASON_HIGH = score_reasons_config.get("high", "字段符合规则要求")
SCORE_REASON_MEDIUM = score_reasons_config.get("medium", "字段基本符合要求,有轻微问题")
SCORE_REASON_LOW = score_reasons_config.get("low", "字段存在严重问题")

# --- "logging" section --------------------------------------------------------
ENABLE_VERBOSE_LOGGING = _config.get("logging", {}).get("enable_verbose_logging", False)
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# 置信度评估系统配置文件
|
|
2
|
+
|
|
3
|
+
# Wilson置信区间配置
|
|
4
|
+
wilson:
|
|
5
|
+
# Wilson置信区间的z分数映射
|
|
6
|
+
z_scores:
|
|
7
|
+
0.90: 1.645
|
|
8
|
+
0.95: 1.96
|
|
9
|
+
0.99: 2.576
|
|
10
|
+
|
|
11
|
+
# 默认置信水平
|
|
12
|
+
default_confidence_level: 0.95
|
|
13
|
+
|
|
14
|
+
# 无数据时的默认中性置信度
|
|
15
|
+
default_neutral_confidence: 0.5
|
|
16
|
+
|
|
17
|
+
# 规则评分配置
|
|
18
|
+
scoring:
|
|
19
|
+
# 评分阈值
|
|
20
|
+
threshold_high: 0.8 # 高分阈值(字段符合规则要求)
|
|
21
|
+
threshold_medium: 0.5 # 中等分阈值(字段基本符合要求)
|
|
22
|
+
|
|
23
|
+
# 特殊值处理分数
|
|
24
|
+
special_value_correct_score: 0.9 # 特殊值正确处理的分数
|
|
25
|
+
special_value_warning_score: 0.7 # 特殊值可能处理不当的分数
|
|
26
|
+
|
|
27
|
+
# 日期格式错误的惩罚分数
|
|
28
|
+
date_format_error_score: 0.3
|
|
29
|
+
|
|
30
|
+
# 值映射正确的分数
|
|
31
|
+
mapping_correct_score: 0.9
|
|
32
|
+
|
|
33
|
+
# 权重配置
|
|
34
|
+
weights:
|
|
35
|
+
# 字段重要性权重
|
|
36
|
+
required_field_weight: 2.0 # 必填字段权重
|
|
37
|
+
optional_field_weight: 1.0 # 非必填字段权重
|
|
38
|
+
|
|
39
|
+
# 上下文类型权重
|
|
40
|
+
context_type_weights:
|
|
41
|
+
table: 0.1
|
|
42
|
+
text: 0.05
|
|
43
|
+
title: 0.02
|
|
44
|
+
# 不同维度权重
|
|
45
|
+
dimension_weights:
|
|
46
|
+
schema: 0.10
|
|
47
|
+
sim: 0.2
|
|
48
|
+
wilson: 0.10
|
|
49
|
+
relation: 0.60
|
|
50
|
+
# 日期格式配置
|
|
51
|
+
date_formats:
|
|
52
|
+
# 支持的日期格式列表
|
|
53
|
+
supported_formats:
|
|
54
|
+
- "%Y-%m-%d %H:%M:%S" # 2025-05-16 14:30:00
|
|
55
|
+
- "%Y-%m-%d" # 2025-05-16
|
|
56
|
+
- "%Y/%m/%d %H:%M:%S" # 2025/05/16 14:30:00
|
|
57
|
+
- "%Y/%m/%d" # 2025/05/16
|
|
58
|
+
|
|
59
|
+
# 文件路径配置
|
|
60
|
+
paths:
|
|
61
|
+
# 项目根目录将在运行时动态计算
|
|
62
|
+
# 默认规则文件路径
|
|
63
|
+
default_rules_file: "schema/ruler/meeting_ruler.json"
|
|
64
|
+
|
|
65
|
+
# wilson&entropy 统计文件路径
|
|
66
|
+
wilson_entropy_statistic_file: "methods/data/wilson&ent"
|
|
67
|
+
|
|
68
|
+
# 历史文件路径
|
|
69
|
+
historical_data_file: "data/wrong_pdf/10.22/test_data_fusion.json"
|
|
70
|
+
|
|
71
|
+
# 统计文件后缀
|
|
72
|
+
statistic_file_suffix: "_statistic.json"
|
|
73
|
+
|
|
74
|
+
# 测试数据文件路径
|
|
75
|
+
test_data_file: "data/wrong_pdf/ori/10.24_before.json"
|
|
76
|
+
|
|
77
|
+
# 相似度分数文件路径
|
|
78
|
+
sim_score_file: "methods/data/sim"
|
|
79
|
+
|
|
80
|
+
# schema文件路径(用于AI评估)
|
|
81
|
+
schema_file: "schema/meeting.json"
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# 评分理由模板
|
|
86
|
+
score_reasons:
|
|
87
|
+
high: "字段符合规则要求"
|
|
88
|
+
medium: "字段基本符合要求,有轻微问题"
|
|
89
|
+
low: "字段存在严重问题"
|
|
90
|
+
|
|
91
|
+
# 日志配置
|
|
92
|
+
logging:
|
|
93
|
+
# 是否启用详细日志
|
|
94
|
+
enable_verbose_logging: true
|