xforge 0.4.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xforge-0.4.3/PKG-INFO +10 -0
- xforge-0.4.3/README.md +198 -0
- xforge-0.4.3/pyproject.toml +25 -0
- xforge-0.4.3/scripts/export_dsl.py +60 -0
- xforge-0.4.3/scripts/generate_parquet_sql.py +195 -0
- xforge-0.4.3/scripts/generate_rule_manual.py +1091 -0
- xforge-0.4.3/scripts/import_rules.py +103 -0
- xforge-0.4.3/scripts/import_sql.py +167 -0
- xforge-0.4.3/scripts/import_sql_files.py +126 -0
- xforge-0.4.3/scripts/rewrite_sql_duckdb.py +271 -0
- xforge-0.4.3/scripts/run_center.py +286 -0
- xforge-0.4.3/setup.cfg +4 -0
- xforge-0.4.3/src/__init__.py +0 -0
- xforge-0.4.3/src/app.py +22 -0
- xforge-0.4.3/src/config.py +119 -0
- xforge-0.4.3/src/database/__init__.py +0 -0
- xforge-0.4.3/src/database/connection.py +57 -0
- xforge-0.4.3/src/database/migrate.py +29 -0
- xforge-0.4.3/src/models/__init__.py +0 -0
- xforge-0.4.3/src/models/alert.py +161 -0
- xforge-0.4.3/src/models/analytics.py +218 -0
- xforge-0.4.3/src/models/data_source.py +78 -0
- xforge-0.4.3/src/models/execution.py +142 -0
- xforge-0.4.3/src/models/model_category.py +63 -0
- xforge-0.4.3/src/models/rule.py +380 -0
- xforge-0.4.3/src/models/rule_version.py +69 -0
- xforge-0.4.3/src/services/__init__.py +0 -0
- xforge-0.4.3/src/services/execution_service.py +218 -0
- xforge-0.4.3/src/services/export_service.py +75 -0
- xforge-0.4.3/src/services/import_service.py +151 -0
- xforge-0.4.3/src/services/sql_adapter.py +440 -0
- xforge-0.4.3/src/tui/app.py +216 -0
- xforge-0.4.3/src/tui/screens/__init__.py +0 -0
- xforge-0.4.3/src/tui/screens/alert_detail.py +194 -0
- xforge-0.4.3/src/tui/screens/execution_detail.py +131 -0
- xforge-0.4.3/src/tui/screens/help_screen.py +85 -0
- xforge-0.4.3/src/tui/screens/import_screen.py +67 -0
- xforge-0.4.3/src/tui/screens/rule_detail.py +178 -0
- xforge-0.4.3/src/tui/screens/rule_edit.py +179 -0
- xforge-0.4.3/src/tui/startup_check.py +128 -0
- xforge-0.4.3/src/tui/tabs/__init__.py +0 -0
- xforge-0.4.3/src/tui/tabs/alert.py +315 -0
- xforge-0.4.3/src/tui/tabs/analytics.py +256 -0
- xforge-0.4.3/src/tui/tabs/approval.py +170 -0
- xforge-0.4.3/src/tui/tabs/dashboard.py +174 -0
- xforge-0.4.3/src/tui/tabs/execution.py +234 -0
- xforge-0.4.3/src/tui/tabs/query.py +544 -0
- xforge-0.4.3/src/tui/tabs/settings.py +314 -0
- xforge-0.4.3/src/tui/widgets/__init__.py +0 -0
- xforge-0.4.3/src/utils/__init__.py +0 -0
- xforge-0.4.3/src/utils/csv_importer.py +228 -0
- xforge-0.4.3/src/utils/dsl_generator.py +223 -0
- xforge-0.4.3/src/utils/excel_parser.py +290 -0
- xforge-0.4.3/xforge.egg-info/PKG-INFO +10 -0
- xforge-0.4.3/xforge.egg-info/SOURCES.txt +57 -0
- xforge-0.4.3/xforge.egg-info/dependency_links.txt +1 -0
- xforge-0.4.3/xforge.egg-info/entry_points.txt +5 -0
- xforge-0.4.3/xforge.egg-info/requires.txt +5 -0
- xforge-0.4.3/xforge.egg-info/top_level.txt +2 -0
xforge-0.4.3/PKG-INFO
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: xforge
|
|
3
|
+
Version: 0.4.3
|
|
4
|
+
Summary: XForge — 穿透式监管规则锻造平台 | Penetrating Supervision Rule Forge
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Requires-Dist: textual>=0.52
|
|
7
|
+
Requires-Dist: duckdb>=0.9
|
|
8
|
+
Requires-Dist: polars>=0.19
|
|
9
|
+
Requires-Dist: openpyxl>=3.1
|
|
10
|
+
Requires-Dist: pyyaml>=6.0
|
xforge-0.4.3/README.md
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
# XForge — 穿透式监管规则全生命周期管理平台
|
|
2
|
+
|
|
3
|
+
规则的全生命周期管理中心:Excel 导入 → 状态匹配 → 编写 SQL → 审批部署 → DuckDB 直接执行 → 预警工单。
|
|
4
|
+
|
|
5
|
+
**执行架构:** DuckDB in-process 直接执行 SQL,不再通过 subprocess 调用外部 regula。DSL 导出作为兼容层保留。
|
|
6
|
+
|
|
7
|
+
## 快速开始
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
git clone ssh://git@codeberg.org/songwupei/RuleForge.git
|
|
11
|
+
cd RuleForge
|
|
12
|
+
cp config.yaml.example config.yaml # 按需修改 Excel 路径和数据绑定
|
|
13
|
+
pip install -e "."
|
|
14
|
+
|
|
15
|
+
# 导入规则(从规则手册 Excel)
|
|
16
|
+
python scripts/import_rules.py --excel --clear
|
|
17
|
+
|
|
18
|
+
# 启动 TUI
|
|
19
|
+
python -m src.app
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## 依赖
|
|
23
|
+
|
|
24
|
+
| 依赖 | 要求 |
|
|
25
|
+
|------|------|
|
|
26
|
+
| textual | ≥0.52 |
|
|
27
|
+
| duckdb | ≥0.9 |
|
|
28
|
+
| polars | ≥0.19 |
|
|
29
|
+
| openpyxl | ≥3.1 |
|
|
30
|
+
| pyyaml | ≥6.0 |
|
|
31
|
+
|
|
32
|
+
## 界面
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
┌─ 统计面板 (可拖拽) ─┬─ 快速筛选 ─────────────────────────────┐
|
|
36
|
+
│ 📊 筛选统计 │ [问题乱象▼] [状态▼] │
|
|
37
|
+
│ │ 📋 条件筛选 │
|
|
38
|
+
│ 按问题乱象 │ [字段▼] [运算符▼] [值___] [✕] │
|
|
39
|
+
│ 靠企吃企 6 ████ │ [+条件] [搜索] [重置] │
|
|
40
|
+
│ 过度负债 11 ████ │ ───────────────────────────────────── │
|
|
41
|
+
│ ... │ │模型代码│模型名称│问题乱象│规则序号│..│
|
|
42
|
+
│ │ │KQCQ_11│假期开票│靠企吃企│KQCQ_11_1│ │
|
|
43
|
+
│ 按状态 │ │... │... │... │... │ │
|
|
44
|
+
│ 🟢生效中 19 ████ │ │
|
|
45
|
+
│ 🔍审核中 28 ████ │ │
|
|
46
|
+
│ 📝草稿 131 ████ │ │
|
|
47
|
+
└─────────────────────┴────────────────────────────────────────┘
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## 快捷键
|
|
51
|
+
|
|
52
|
+
| 键 | 功能 |
|
|
53
|
+
|----|------|
|
|
54
|
+
| `Enter` | 查看规则详情(6维度 + 版本历史) |
|
|
55
|
+
| `e` | 编辑规则 |
|
|
56
|
+
| `n` | 新建规则 |
|
|
57
|
+
| `a` | 推进状态(draft→review→approved→deployed→active) |
|
|
58
|
+
| `s` | 暂停 |
|
|
59
|
+
| `r` | 回退 |
|
|
60
|
+
| `Backspace` | 废弃 |
|
|
61
|
+
| `f` | 聚焦条件筛选 |
|
|
62
|
+
| `[` / `]` | 面板变窄/变宽 |
|
|
63
|
+
| 鼠标拖 `┃` | 调整面板宽度 |
|
|
64
|
+
| `x` | 执行规则(异步,不阻塞 UI) |
|
|
65
|
+
| `t` | 执行历史:全部/汇总切换 |
|
|
66
|
+
|
|
67
|
+
| `ctrl+1~6` | 切换 Tab |
|
|
68
|
+
| `?` | 帮助 |
|
|
69
|
+
|
|
70
|
+
## 命令行工具
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
# 导入规则(从 Excel 规则手册,--clear 清空重导)
|
|
74
|
+
python scripts/import_rules.py --excel --clear
|
|
75
|
+
|
|
76
|
+
# 导入 SQL 文件(按 sub_code 匹配)
|
|
77
|
+
python scripts/import_sql_files.py
|
|
78
|
+
|
|
79
|
+
# 执行中心(交互式多选 + 实时进度 + 自动备份 + 日志)
|
|
80
|
+
python scripts/run_center.py # 交互选择
|
|
81
|
+
python scripts/run_center.py --all # 全部有 SQL 的规则
|
|
82
|
+
python scripts/run_center.py --all --clear # 清空旧数据后全部执行
|
|
83
|
+
|
|
84
|
+
# 改写 MySQL SQL 为 DuckDB 语法
|
|
85
|
+
python scripts/rewrite_sql_duckdb.py
|
|
86
|
+
|
|
87
|
+
# 导出 DSL
|
|
88
|
+
python scripts/export_dsl.py [--model 靠企吃企] [--status active]
|
|
89
|
+
|
|
90
|
+
# 生成规则手册(7 步模板 + AI 增强 + PDF/DOCX,默认全部开启)
|
|
91
|
+
python scripts/generate_rule_manual.py --all # 默认:AI, -ai后缀, PDF+DOCX, 已有跳过
|
|
92
|
+
python scripts/generate_rule_manual.py --all --no-ai # 仅 SQL 解析,不调 AI
|
|
93
|
+
python scripts/generate_rule_manual.py --all --no-pdf # 仅生成 DOCX
|
|
94
|
+
python scripts/generate_rule_manual.py --all --suffix '' # 文件名不加 -ai 后缀
|
|
95
|
+
python scripts/generate_rule_manual.py --all --force # 强制重生成 md
|
|
96
|
+
|
|
97
|
+
# 一键生成全部规则手册(AI 增强 + PDF/DOCX + 自动备份)
|
|
98
|
+
./make_all_manual.sh # 全部流程
|
|
99
|
+
./make_all_manual.sh --no-ai # 跳过 AI
|
|
100
|
+
./make_all_manual.sh --no-pdf # 仅生成 md + docx
|
|
101
|
+
./make_all_manual.sh --rule KQCQ_12 # 仅指定规则
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
> **规则手册生成**:`generate_rule_manual.py` 从数据库读取规则 / SQL / 执行历史,按 7 步模板生成 Markdown。默认启用 AI 从 SQL 反向解读业务逻辑(`$OPENAI_API_KEY` + `$OPENAI_BASE_URL` 自动适配 DeepSeek/OpenAI),`--no-ai` 关闭。默认同时输出 PDF + DOCX,`--no-pdf` / `--no-docx` 排除。已有文件自动跳过,`--force` 强制重生成。`make_all_manual.sh` 一键执行 + 自动备份到 `data/manual/backups/`。
|
|
105
|
+
|
|
106
|
+
## 规则状态流转
|
|
107
|
+
|
|
108
|
+
规则从录入到退役的完整生命周期,共 8 种状态:
|
|
109
|
+
|
|
110
|
+
```
|
|
111
|
+
draft ──→ review ──→ approved ──→ deployed ──→ active ⇄ suspended
|
|
112
|
+
↑ ↑ ↑ ↑ ↑
|
|
113
|
+
└──────────┴───────────┴────────────┴───────────┘ (r 回退)
|
|
114
|
+
↑
|
|
115
|
+
deprecated ──→ retired
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
| 状态 | 图标 | 含义 | 判定来源 |
|
|
119
|
+
|------|------|------|----------|
|
|
120
|
+
| `draft` | 📝 | 草稿 | 默认状态;开发状态未匹配到其他规则 |
|
|
121
|
+
| `review` | 🔍 | 审核中 | Excel 开发状态含「待主责部门确认」(28 条) |
|
|
122
|
+
| `active` | 🟢 | 生效中 | Excel 开发状态含「已上线」(19 条) |
|
|
123
|
+
|
|
124
|
+
> **状态初值由 Excel 概览 sheet 的「开发状态」字段决定**,通过 `config.yaml` 的 `status_rules` 关键词匹配。其余状态(approved/deployed/suspended/deprecated/retired)在 TUI 中手动流转。
|
|
125
|
+
|
|
126
|
+
**推进路径(`a`):** draft → review → approved → deployed → active
|
|
127
|
+
**回退(`r`):** 任一步骤逆向前一步
|
|
128
|
+
**废弃(`Backspace`):** → deprecated → retired
|
|
129
|
+
|
|
130
|
+
## 6维度规则模型
|
|
131
|
+
|
|
132
|
+
| 维度 | 内容 | 示例 |
|
|
133
|
+
|------|------|------|
|
|
134
|
+
| 1. 管理要求 | 国资委文件/内部制度 | 国资委《关于加强中央企业资金管理有关事项的通知》 |
|
|
135
|
+
| 2. 规则解读 | 业务语言翻译 | 集团合并货币资金<50亿 且 货币资金/带息负债<3% |
|
|
136
|
+
| 3. 系统及字段 | 数据源、表名、字段 | 司库系统 → account_balance → 货币资金余额 |
|
|
137
|
+
| 4. SQL代码 | 可执行查询 | `SELECT ... WHERE cash < 50e8` |
|
|
138
|
+
| 5. 输出结果 | 预警记录结构 | alert_record(rule_id, corp_code, metric_value) |
|
|
139
|
+
| 6. 输出解读 | 业务核查指南 | 比值<3%表明现金紧张,需核查资金归集情况 |
|
|
140
|
+
|
|
141
|
+
## 配置说明
|
|
142
|
+
|
|
143
|
+
`config.yaml` 核心配置项:
|
|
144
|
+
|
|
145
|
+
```yaml
|
|
146
|
+
# Excel 导入
|
|
147
|
+
excel:
|
|
148
|
+
excluded_sheets: [封皮, 封面, ...] # 跳过的 sheet
|
|
149
|
+
excluded_models: [军品工资奖金套取, ...] # 排除的问题乱象
|
|
150
|
+
column_mapping: {规则代码: rule_code, ...} # 默认列映射
|
|
151
|
+
sheet_mappings: # 特定 sheet 覆盖映射
|
|
152
|
+
概览: {模型: model_name, 开发状态: dev_status, ...}
|
|
153
|
+
status_rules: # 开发状态→生命周期状态
|
|
154
|
+
- keyword: "已上线"
|
|
155
|
+
status: "active"
|
|
156
|
+
- keyword: "待主责部门确认"
|
|
157
|
+
status: "review"
|
|
158
|
+
|
|
159
|
+
# UI 术语
|
|
160
|
+
terminology:
|
|
161
|
+
model: "问题乱象"
|
|
162
|
+
rule_code: "模型代码"
|
|
163
|
+
rule_name: "模型名称"
|
|
164
|
+
rule_interpretation: "规则要点"
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
## Roadmap
|
|
168
|
+
|
|
169
|
+
### ✅ v0.3.1 — 全部完成
|
|
170
|
+
|
|
171
|
+
- [x] 6 Tab TUI(仪表盘/查询/审批/执行/预警/分析)
|
|
172
|
+
- [x] 执行中心(交互式多选 + 实时进度 + DB 自动备份 + 日志)
|
|
173
|
+
- [x] 分析 Tab(覆盖率/触发率/SLA 时效/执行趋势 + 手动更新)
|
|
174
|
+
- [x] 仪表盘积压告警(>7/14 天红色标记)
|
|
175
|
+
- [x] 执行历史全部/汇总切换(t 键)
|
|
176
|
+
- [x] 异步执行(不阻塞 UI)+ 耗时用分秒显示
|
|
177
|
+
- [x] 概览 sheet 独立列映射 + 开发状态→生命周期状态自动匹配
|
|
178
|
+
- [x] 配置驱动的术语系统 + 排除清单
|
|
179
|
+
- [x] 完整子代码(sub_code)支持
|
|
180
|
+
- [x] 延续行 logic 拼接(不丢多行规则解析)
|
|
181
|
+
- [x] Textual 6.x/8.x NoSelection 兼容
|
|
182
|
+
- [x] KQCQ_10_2 SQL 优化 (390s→4s, CTE+coalesce)
|
|
183
|
+
- [x] 跨批次去重 (同批次不去重, 不同批次相同数据跳过)
|
|
184
|
+
- [x] 设置 Tab (一键备份/还原 + 执行中心快捷入口)
|
|
185
|
+
- [x] 启动环境检测 (startup_check)
|
|
186
|
+
|
|
187
|
+
### ✅ v0.3.4 — 全部完成
|
|
188
|
+
|
|
189
|
+
- [x] 字段名映射表:从附件1 parquet 完整提取(80 表 1679 字段,覆盖率 62%→100%),CSV 格式优先加载
|
|
190
|
+
- [x] parquet 直查 SQL(`rules_dsl/sql/parquet/`):`generate_parquet_sql.py` 自动生成,列名中文 + `read_parquet()` 自包含
|
|
191
|
+
- [x] 设置 Tab:SQL 导入 + MySQL→DuckDB 转换 + 映射表状态/刷新缓存
|
|
192
|
+
- [x] 审批/执行/预警三个 Tab 新增「规则序号」列
|
|
193
|
+
- [x] 预警工单批量多选修复(`update_cell` 单格更新,不重建表格)
|
|
194
|
+
- [x] Header 显示版本号(自动读取 pyproject.toml)
|
|
195
|
+
- [x] SQL 导入过滤 TODO 占位文件
|
|
196
|
+
- [x] sql_adapter 精确匹配优先 + 模糊匹配日志 + `发票代码→fpdm, 单位→dw` 修正
|
|
197
|
+
- [x] filepulse 自动同步映射表到 RuleForge(kedro hook 集成)
|
|
198
|
+
- [x] 删除重复 `data/rules_dsl/` 目录
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "xforge"
|
|
3
|
+
version = "0.4.3"
|
|
4
|
+
description = "XForge — 穿透式监管规则锻造平台 | Penetrating Supervision Rule Forge"
|
|
5
|
+
requires-python = ">=3.10"
|
|
6
|
+
dependencies = [
|
|
7
|
+
"textual>=0.52",
|
|
8
|
+
"duckdb>=0.9",
|
|
9
|
+
"polars>=0.19",
|
|
10
|
+
"openpyxl>=3.1",
|
|
11
|
+
"pyyaml>=6.0",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
[project.scripts]
|
|
15
|
+
xforge = "src.app:main"
|
|
16
|
+
xforge-import = "scripts.import_rules:main"
|
|
17
|
+
xforge-export = "scripts.export_dsl:main"
|
|
18
|
+
xforge-manual = "scripts.generate_rule_manual:main"
|
|
19
|
+
|
|
20
|
+
[build-system]
|
|
21
|
+
requires = ["setuptools>=68"]
|
|
22
|
+
build-backend = "setuptools.build_meta"
|
|
23
|
+
|
|
24
|
+
[tool.setuptools.packages.find]
|
|
25
|
+
include = ["src*", "scripts*"]
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""一键导出:SQLite 规则 → regula DSL JSON 文件。
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
python scripts/export_dsl.py
|
|
6
|
+
python scripts/export_dsl.py --model 靠企吃企
|
|
7
|
+
python scripts/export_dsl.py --status approved
|
|
8
|
+
python scripts/export_dsl.py --rule-id 1 --rule-id 2
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import sys
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
_PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
|
17
|
+
if str(_PROJECT_ROOT) not in sys.path:
|
|
18
|
+
sys.path.insert(0, str(_PROJECT_ROOT))
|
|
19
|
+
|
|
20
|
+
from src.database.connection import init_db
|
|
21
|
+
from src.services.export_service import export_to_dsl
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def main() -> None:
    """Command-line entry point for the DSL export script.

    Parses ``--model`` / ``--status`` / ``--rule-id`` (repeatable), calls
    ``init_db()``, then forwards the filters to ``export_to_dsl``.  A summary
    is printed, followed by at most ten output file names and at most ten
    error messages.  The process exits with status 1 when the result
    reports any error.
    """
    import argparse

    cli = argparse.ArgumentParser(description="导出规则 → regula DSL JSON")
    cli.add_argument("--model", "-m", type=str, help="按模型筛选")
    cli.add_argument("--status", "-s", type=str, help="按状态筛选")
    cli.add_argument(
        "--rule-id",
        type=int,
        action="append",
        dest="rule_ids",
        help="指定规则 ID(可重复)",
    )
    opts = cli.parse_args()

    init_db()

    outcome = export_to_dsl(
        rule_ids=opts.rule_ids,
        model=opts.model,
        status=opts.status,
    )

    print(f"[export] 完成: 生成 {outcome.generated}, 失败 {outcome.failed}")

    exported = outcome.files
    if exported:
        # All files are assumed to share one directory; report the first one's parent.
        print(f"[export] 输出目录: {exported[0].parent}")
        for path in exported[:10]:
            print(f" - {path.name}")
        if len(exported) > 10:
            print(f" ... 共 {len(exported)} 个文件")

    problems = outcome.errors
    if not problems:
        return

    print(f"\n[export] 错误 ({len(problems)}):")
    for message in problems[:10]:
        print(f" - {message}")
    sys.exit(1)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
if __name__ == "__main__":
|
|
60
|
+
main()
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""从 rules_dsl/sql/*.sql 生成 parquet 直查版 SQL → rules_dsl/sql/parquet/
|
|
3
|
+
|
|
4
|
+
转换规则:
|
|
5
|
+
- ml_kqcq_zzsfp: 拼音别名 → 中文 parquet 列名 (read_parquet)
|
|
6
|
+
- ml_cd_company / ml_base_fdjjr / ml_base_fdjjr_bx: 列名不变
|
|
7
|
+
- 保留 DuckDB 语法 (string_agg, regexp_matches, any_value 等)
|
|
8
|
+
- 跳过注释行,避免旧代码块被误转换
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import re
|
|
14
|
+
import sys
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
_PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
|
18
|
+
if str(_PROJECT_ROOT) not in sys.path:
|
|
19
|
+
sys.path.insert(0, str(_PROJECT_ROOT))
|
|
20
|
+
|
|
21
|
+
import polars as pl
|
|
22
|
+
import yaml
|
|
23
|
+
|
|
24
|
+
from src.services.sql_adapter import build_column_map
|
|
25
|
+
|
|
26
|
+
SQL_DIR = _PROJECT_ROOT / "rules_dsl" / "sql"
|
|
27
|
+
PARQUET_OUT_DIR = SQL_DIR / "parquet"
|
|
28
|
+
CONFIG_PATH = _PROJECT_ROOT / "config.yaml"
|
|
29
|
+
|
|
30
|
+
# 补充修正:build_column_map 模糊匹配会误伤的列
|
|
31
|
+
# parquet 列名 → 正确拼音别名(覆盖 build_column_map 的错误结果)
|
|
32
|
+
_ALIAS_FIXES: dict[str, dict[str, str]] = {
|
|
33
|
+
"ml_kqcq_zzsfp": {
|
|
34
|
+
"发票代码": "fpdm", # 发票代码,模糊匹配错映射到 fphm(发票号码)
|
|
35
|
+
"单位": "dw", # 计量单位,模糊匹配错映射到 xfdwmc(销方名称)
|
|
36
|
+
},
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def load_config() -> dict:
    """Parse the YAML file at ``CONFIG_PATH`` (UTF-8) and return its content.

    When ``yaml.safe_load`` yields a falsy value (for example an empty
    document), an empty dict is returned instead.
    """
    with open(CONFIG_PATH, encoding="utf-8") as stream:
        parsed = yaml.safe_load(stream)
    if parsed:
        return parsed
    return {}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def build_subquery(table_name: str, parquet_path: str) -> str:
    """Build a ``read_parquet`` sub-query for one table, with unique aliases.

    Args:
        table_name: Logical table name used in the rule SQL.
        parquet_path: Path of the parquet file bound to that table; it is
            resolved to an absolute path before being embedded in the SQL.

    Returns:
        The sub-query text, shaped like::

            (SELECT "开票日期" AS kpsj, ... FROM read_parquet('/path'))

    For ``ml_kqcq_zzsfp`` each parquet column is aliased through
    ``_ALIAS_FIXES`` first and ``build_column_map`` second, falling back to
    the column name itself.  Every other table keeps its column names.
    """

    def quote_ident(name: str) -> str:
        # SQL identifier quoting: embedded double quotes are doubled.
        return '"' + name.replace('"', '""') + '"'

    pq_path = str(Path(parquet_path).resolve())
    # Only the column names are needed, so read the schema (file footer)
    # instead of loading the whole parquet file into memory.
    parquet_cols = list(pl.read_parquet_schema(pq_path))
    fixes = _ALIAS_FIXES.get(table_name, {})

    select_parts = []

    if table_name == "ml_kqcq_zzsfp":
        # Per the original author's note, build_column_map combines a mapping
        # file, manual corrections and fuzzy matching (not visible here).
        col_map = build_column_map(table_name, parquet_cols)
        used_aliases: set[str] = set()
        dup_count: dict[str, int] = {}

        for pc in parquet_cols:
            # Explicit corrections take precedence over the computed map.
            alias = fixes[pc] if pc in fixes else col_map.get(pc, pc)

            # De-duplicate: the first occurrence keeps the alias, later ones
            # get _2, _3, ...  Keep counting until the suffixed name is free,
            # so it cannot clash with an alias that is already in use.
            if alias in used_aliases:
                base = alias
                counter = dup_count.get(base, 1)
                while True:
                    counter += 1
                    candidate = f"{base}_{counter}"
                    if candidate not in used_aliases:
                        break
                dup_count[base] = counter
                alias = candidate
            used_aliases.add(alias)

            # Plain identifiers stay unquoted (unchanged output); anything
            # else (spaces, brackets, ...) must be quoted to remain valid SQL.
            alias_sql = alias if alias.isidentifier() else quote_ident(alias)
            select_parts.append(f' {quote_ident(pc)} AS {alias_sql}')
    else:
        for pc in parquet_cols:
            quoted = quote_ident(pc)
            select_parts.append(f' {quoted} AS {quoted}')

    col_list = ",\n".join(select_parts)
    # Escape single quotes so a path containing ' cannot break the literal.
    pq_literal = pq_path.replace("'", "''")
    return (
        f"(\n"
        f" SELECT\n"
        f"{col_list}\n"
        f" FROM read_parquet('{pq_literal}')\n"
        f")"
    )
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _is_comment_line(line: str) -> bool:
|
|
94
|
+
"""判断是否为纯注释行。"""
|
|
95
|
+
stripped = line.strip()
|
|
96
|
+
return stripped.startswith('--') or stripped == ''
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# Clause keywords that can directly follow a table name.  A word from this
# set is the start of the next clause, never a table alias.
_NOT_AN_ALIAS = frozenset({
    "WHERE", "ON", "USING", "JOIN", "INNER", "LEFT", "RIGHT", "FULL",
    "CROSS", "NATURAL", "ASOF", "POSITIONAL", "SEMI", "ANTI", "LATERAL",
    "GROUP", "ORDER", "HAVING", "QUALIFY", "WINDOW", "LIMIT", "OFFSET",
    "FETCH", "UNION", "EXCEPT", "INTERSECT", "TABLESAMPLE", "PIVOT",
    "UNPIVOT",
})


def replace_table_refs(sql: str, data_bindings: dict[str, str]) -> str:
    """Replace table references in *sql* with ``read_parquet`` sub-queries.

    Args:
        sql: SQL text; it is processed line by line, and lines for which
            ``_is_comment_line`` is true are copied through untouched.
        data_bindings: Mapping of table name -> parquet path.

    Returns:
        The SQL text with every ``FROM <table>`` / ``JOIN <table>`` reference
        replaced by ``<subquery> AS <alias>``.  An explicit alias (with or
        without ``AS``) is kept; a reference without alias gets ``_<table>``.

    The sub-query of a table is built at most once, and only when the table
    is actually referenced, because ``build_subquery`` touches the parquet
    file.
    """
    # Longest names first, as in the original ordering.
    bindings = sorted(data_bindings.items(), key=lambda item: -len(item[0]))
    compiled = [
        (
            name,
            path,
            re.compile(
                # group 1: FROM/JOIN + whitespace; group 2: gap before the
                # following word; group 3: that word (alias or keyword).
                rf'\b((?:FROM|JOIN)\s+){re.escape(name)}(?!\w)'
                r'(?:(\s+)(?:AS\s+)?(\w+)\b)?',
                re.IGNORECASE,
            ),
        )
        for name, path in bindings
    ]
    subqueries: dict[str, str] = {}

    def rewrite(match: re.Match, name: str, path: str) -> str:
        if name not in subqueries:
            subqueries[name] = build_subquery(name, path)
        sub = subqueries[name]
        head, gap, word = match.group(1), match.group(2), match.group(3)

        if word is None:
            # No alias at all: generate one.
            return f"{head}{sub} AS _{name}"
        upper = word.upper()
        if upper == "AS":
            # Dangling AS (alias on the next line): keep it as written.
            return f"{head}{sub}{gap}{word}"
        if upper in _NOT_AN_ALIAS:
            # The following word opens the next clause; it is not an alias.
            return f"{head}{sub} AS _{name}{gap}{word}"
        return f"{head}{sub} AS {word}"

    result_lines: list[str] = []
    for line in sql.split('\n'):
        if _is_comment_line(line):
            result_lines.append(line)
            continue

        modified = line
        for name, path, pattern in compiled:
            # A callable replacement avoids backslash processing of the
            # sub-query text; defaults bind the loop variables per table.
            modified = pattern.sub(
                lambda m, n=name, p=path: rewrite(m, n, p),
                modified,
            )
        result_lines.append(modified)

    return '\n'.join(result_lines)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _strip_comments(sql: str) -> str:
|
|
145
|
+
"""去掉 SQL 注释行,返回纯净的 SQL 语句。"""
|
|
146
|
+
lines = []
|
|
147
|
+
for line in sql.split('\n'):
|
|
148
|
+
stripped = line.strip()
|
|
149
|
+
if stripped.startswith('--'):
|
|
150
|
+
continue
|
|
151
|
+
lines.append(line)
|
|
152
|
+
return '\n'.join(lines).strip().rstrip(';')
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def main() -> None:
    """Generate a parquet-direct copy of every ``*.sql`` file in ``SQL_DIR``.

    Reads ``data_bindings`` from the configuration and returns early (with a
    message) when there is none.  Each SQL file is rewritten through
    ``replace_table_refs`` and written, with a generated header, to
    ``PARQUET_OUT_DIR`` under the same file name.  Placeholder files are
    skipped and counted separately.
    """
    config = load_config()
    data_bindings = config.get("data_bindings", {})
    if not data_bindings:
        print("[generate] config.yaml 中无 data_bindings 配置,退出")
        return

    PARQUET_OUT_DIR.mkdir(parents=True, exist_ok=True)

    generated = 0
    skipped = 0

    for fp in sorted(SQL_DIR.glob("*.sql")):
        sql_text = fp.read_text(encoding="utf-8").strip()

        # Skip TODO placeholder files.  _strip_comments already removes
        # trailing semicolons, so normalise the remaining whitespace and case
        # and compare once; a file with no statement at all is skipped too.
        statement = " ".join(
            _strip_comments(sql_text).rstrip("; \t\r\n").split()
        )
        if statement.upper() in ("", "SELECT 1"):
            skipped += 1
            continue

        # Build the parquet version.
        header = (
            f"-- Parquet 直查版: {fp.name}\n"
            f"-- 来源: rules_dsl/sql/{fp.name}\n"
            f"-- 可直接在 DuckDB 中执行,无需依赖项目 VIEW\n"
            f"-- 生成脚本: scripts/generate_parquet_sql.py\n\n"
        )
        transformed = replace_table_refs(sql_text, data_bindings)
        out_path = PARQUET_OUT_DIR / fp.name
        out_path.write_text(header + transformed + "\n", encoding="utf-8")
        print(f" [OK] {fp.name} → parquet/{fp.name}")
        generated += 1

    print(f"\n[generate] 生成 {generated} 个, 跳过 {skipped} 个 (TODO/SELECT 1)")
    print(f"[generate] 输出目录: {PARQUET_OUT_DIR}")
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
if __name__ == "__main__":
|
|
195
|
+
main()
|