@chongyan/autospec 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.en.md +472 -0
- package/README.md +476 -0
- package/bin/autospec.js +3 -0
- package/knowledge/README.md +144 -0
- package/knowledge/checklists/code.md +182 -0
- package/knowledge/checklists/design.md +196 -0
- package/knowledge/checklists/release.md +70 -0
- package/knowledge/checklists/requirement.md +169 -0
- package/knowledge/checklists/test.md +46 -0
- package/knowledge/config/README.en.md +44 -0
- package/knowledge/config/README.md +44 -0
- package/knowledge/config/role-composition.yaml +98 -0
- package/knowledge/config/role-extensions.yaml +140 -0
- package/knowledge/config/skill-compositions.yaml +142 -0
- package/knowledge/config/team-stage.yaml +95 -0
- package/knowledge/config/team-tasks.yaml +139 -0
- package/knowledge/config/team-triggers.yaml +198 -0
- package/knowledge/config/validation-patterns.yaml +137 -0
- package/knowledge/domain/README.md +115 -0
- package/knowledge/domain/flows/README.md +194 -0
- package/knowledge/domain/glossary.md +143 -0
- package/knowledge/domain/rules.md +138 -0
- package/knowledge/environment/README.en.md +36 -0
- package/knowledge/environment/README.md +87 -0
- package/knowledge/environment/component-knowledge.md +316 -0
- package/knowledge/environment/detection-patterns.yaml +502 -0
- package/knowledge/environment/middleware-knowledge.md +237 -0
- package/knowledge/environment/template-registry.md +321 -0
- package/knowledge/guides/domain-driven-design.md +345 -0
- package/knowledge/guides/knowledge-management.md +369 -0
- package/knowledge/guides/requirement-engineering.md +329 -0
- package/knowledge/guides/stages/ai-effect-evaluator.md +93 -0
- package/knowledge/guides/stages/code-implementer.md +205 -0
- package/knowledge/guides/stages/code-reviewer.md +111 -0
- package/knowledge/guides/stages/consistency-checker.md +177 -0
- package/knowledge/guides/stages/design-planner.md +401 -0
- package/knowledge/guides/stages/design-reviewer.md +83 -0
- package/knowledge/guides/stages/integration-test-runner.md +105 -0
- package/knowledge/guides/stages/release-checker.md +205 -0
- package/knowledge/guides/stages/requirement-analyzer.md +195 -0
- package/knowledge/guides/stages/requirement-reviewer.md +83 -0
- package/knowledge/guides/stages/security-reviewer.md +89 -0
- package/knowledge/guides/stages/test-context-analyzer.md +250 -0
- package/knowledge/guides/stages/test-generator.md +241 -0
- package/knowledge/guides/stages/test-planner.md +183 -0
- package/knowledge/guides/stages/test-reviewer.md +76 -0
- package/knowledge/guides/stages/unit-test-runner.md +83 -0
- package/knowledge/guides/support/ai-agent-analyzer.md +362 -0
- package/knowledge/guides/support/ai-anomaly-analyzer.md +213 -0
- package/knowledge/guides/support/ai-artifact-evaluator.md +192 -0
- package/knowledge/guides/support/ai-capability-analyzer.md +193 -0
- package/knowledge/guides/support/ai-component-analyzer.md +169 -0
- package/knowledge/guides/support/ai-data-validator.md +276 -0
- package/knowledge/guides/support/ai-evaluation-planner.md +374 -0
- package/knowledge/guides/support/ai-path-evaluator.md +274 -0
- package/knowledge/guides/support/ai-pipeline-evaluator.md +219 -0
- package/knowledge/guides/support/ai-rag-analyzer.md +339 -0
- package/knowledge/guides/support/ai-task-assessor.md +418 -0
- package/knowledge/guides/support/ai-test-diagnostics.md +133 -0
- package/knowledge/guides/support/complexity-assessor.md +268 -0
- package/knowledge/guides/support/component-discovery.md +183 -0
- package/knowledge/guides/support/environment-scanner.md +207 -0
- package/knowledge/guides/support/environment-validator.md +207 -0
- package/knowledge/guides/support/knowledge-generator.md +234 -0
- package/knowledge/guides/support/methodology-extractor.md +55 -0
- package/knowledge/guides/support/pipeline-protocol.md +438 -0
- package/knowledge/guides/support/practice-logger.md +359 -0
- package/knowledge/guides/support/scope-inference.md +174 -0
- package/knowledge/guides/support/skill-distiller.md +91 -0
- package/knowledge/guides/support/skill-updater.md +45 -0
- package/knowledge/guides/support/skill-validator.md +72 -0
- package/knowledge/guides/support/team-orchestrator.md +323 -0
- package/knowledge/guides/support/tech-stack-analyzer.md +139 -0
- package/knowledge/guides/support/test-runner.md +254 -0
- package/knowledge/guides/system-design.md +352 -0
- package/knowledge/organization/ai-native-team.md +318 -0
- package/knowledge/organization/team-metrics.md +228 -0
- package/knowledge/principles/constitution.md +134 -0
- package/knowledge/principles/core-principles.md +368 -0
- package/knowledge/principles/design-philosophy.md +877 -0
- package/knowledge/principles/evolution.md +553 -0
- package/knowledge/process/01-requirement.md +113 -0
- package/knowledge/process/02-design.md +123 -0
- package/knowledge/process/03-implementation.md +90 -0
- package/knowledge/process/04-review.md +80 -0
- package/knowledge/process/05-testing.md +90 -0
- package/knowledge/process/06-delivery.md +88 -0
- package/knowledge/process/README.en.md +38 -0
- package/knowledge/process/README.md +48 -0
- package/knowledge/process/ai-sdlc.md +475 -0
- package/knowledge/process/overview.md +319 -0
- package/knowledge/standards/code-review.md +876 -0
- package/knowledge/standards/coding-style.md +940 -0
- package/knowledge/standards/data-consistency.md +1085 -0
- package/knowledge/standards/document-versioning.md +210 -0
- package/knowledge/standards/risk-detection.md +186 -0
- package/knowledge/templates/ai-evaluation.md +150 -0
- package/knowledge/templates/api-design.md +117 -0
- package/knowledge/templates/database-design.md +132 -0
- package/knowledge/templates/domain-driven-design.md +321 -0
- package/knowledge/templates/product-proposal.md +201 -0
- package/knowledge/templates/system-design.md +227 -0
- package/knowledge/templates/task-breakdown.md +107 -0
- package/knowledge/templates/test-case.md +170 -0
- package/package.json +53 -0
- package/plugins/.claude-plugin/plugin.json +134 -0
- package/plugins/agents/roles/ai-engineer.md +129 -0
- package/plugins/agents/roles/backend-engineer.md +165 -0
- package/plugins/agents/roles/ceo.md +94 -0
- package/plugins/agents/roles/data-engineer.md +135 -0
- package/plugins/agents/roles/devops-engineer.md +181 -0
- package/plugins/agents/roles/frontend-engineer.md +129 -0
- package/plugins/agents/roles/product-owner.md +98 -0
- package/plugins/agents/roles/quality-engineer.md +129 -0
- package/plugins/agents/roles/security-engineer.md +180 -0
- package/plugins/agents/roles/tech-lead.md +97 -0
- package/plugins/agents/support/blind-comparator.md +88 -0
- package/plugins/agents/support/consistency-checker.md +103 -0
- package/plugins/agents/support/failure-diagnostician.md +141 -0
- package/plugins/agents/support/independent-reviewer.md +80 -0
- package/plugins/agents/support/safety-auditor.md +121 -0
- package/plugins/agents/support/skill-benchmarker.md +86 -0
- package/plugins/agents/support/skill-forger.md +105 -0
- package/plugins/agents/support/stage-gate-evaluator.md +121 -0
- package/plugins/agents/support/test-coverage-reviewer.md +73 -0
- package/plugins/benchmarks/templates/README.md +44 -0
- package/plugins/benchmarks/templates/commands/explore-template.yaml +48 -0
- package/plugins/benchmarks/templates/pipeline/agile-template.yaml +84 -0
- package/plugins/benchmarks/templates/pipeline/waterfall-template.yaml +106 -0
- package/plugins/benchmarks/templates/skills/requirement-analyzer-template.yaml +48 -0
- package/plugins/commands/README.en.md +96 -0
- package/plugins/commands/README.md +96 -0
- package/plugins/commands/apply.md +191 -0
- package/plugins/commands/archive.md +76 -0
- package/plugins/commands/env-export.md +79 -0
- package/plugins/commands/env-sync.md +640 -0
- package/plugins/commands/env-template.md +223 -0
- package/plugins/commands/env-update.md +264 -0
- package/plugins/commands/env-validate.md +176 -0
- package/plugins/commands/env.md +79 -0
- package/plugins/commands/explore.md +76 -0
- package/plugins/commands/field-evolve.md +536 -0
- package/plugins/commands/memory.md +249 -0
- package/plugins/commands/project-evolve.md +821 -0
- package/plugins/commands/propose.md +93 -0
- package/plugins/commands/review.md +140 -0
- package/plugins/commands/run.md +224 -0
- package/plugins/commands/status.md +62 -0
- package/plugins/commands/validate.md +108 -0
- package/plugins/hooks/README.en.md +56 -0
- package/plugins/hooks/README.md +56 -0
- package/plugins/hooks/ai-project-guard.js +329 -0
- package/plugins/hooks/artifact-evaluation-hook.js +237 -0
- package/plugins/hooks/constitution-guard.js +211 -0
- package/plugins/hooks/environment-autocommit.js +264 -0
- package/plugins/hooks/environment-manager.js +778 -0
- package/plugins/hooks/execution-tracker.js +354 -0
- package/plugins/hooks/frozen-zone-guard.js +140 -0
- package/plugins/hooks/layer1-validator.js +423 -0
- package/plugins/hooks/lib/artifact-evaluator.js +414 -0
- package/plugins/hooks/lib/benchmarks/change-detector.js +390 -0
- package/plugins/hooks/lib/benchmarks/evaluator.js +605 -0
- package/plugins/hooks/lib/benchmarks/integration-example.js +169 -0
- package/plugins/hooks/lib/data-and-ai-detector.js +275 -0
- package/plugins/hooks/lib/detection-pattern-loader.js +865 -0
- package/plugins/hooks/lib/directory-discovery.js +395 -0
- package/plugins/hooks/lib/environment-config-loader.js +341 -0
- package/plugins/hooks/lib/environment-detector.js +553 -0
- package/plugins/hooks/lib/environment-evolver.js +564 -0
- package/plugins/hooks/lib/environment-registry.js +813 -0
- package/plugins/hooks/lib/execution-path.js +427 -0
- package/plugins/hooks/lib/hook-error-recorder.js +245 -0
- package/plugins/hooks/lib/hook-logger.js +538 -0
- package/plugins/hooks/lib/hook-runner.js +97 -0
- package/plugins/hooks/lib/hook-runner.sh +44 -0
- package/plugins/hooks/lib/hook-state-manager.js +480 -0
- package/plugins/hooks/lib/memory-extractor.js +377 -0
- package/plugins/hooks/lib/memory-manager.js +673 -0
- package/plugins/hooks/lib/metrics-analyzer.js +489 -0
- package/plugins/hooks/lib/project-evolution/auto-fixer.js +511 -0
- package/plugins/hooks/lib/project-evolution/memory-manager.js +346 -0
- package/plugins/hooks/lib/project-evolution/pattern-detector.js +476 -0
- package/plugins/hooks/lib/project-evolution/semantic-indexer.js +480 -0
- package/plugins/hooks/lib/project-structure-detector.js +326 -0
- package/plugins/hooks/lib/rollback-tracker.js +346 -0
- package/plugins/hooks/lib/source-code-scanner.js +596 -0
- package/plugins/hooks/lib/technology-stack-detector.js +374 -0
- package/plugins/hooks/lib/test-failure-analyzer.js +375 -0
- package/plugins/hooks/lib/test-failure-fixer.js +268 -0
- package/plugins/hooks/lib/trace-context.js +277 -0
- package/plugins/hooks/lib/validation-patterns.js +415 -0
- package/plugins/hooks/memory-sync.js +171 -0
- package/plugins/hooks/pipeline-observer.js +413 -0
- package/plugins/hooks/scope-sentinel.js +204 -0
- package/plugins/hooks/trace-initialization.js +169 -0
- package/plugins/memory/templates/code-quality.yaml +149 -0
- package/plugins/memory/templates/multi-system.yaml +155 -0
- package/plugins/memory/templates/team-habits.yaml +119 -0
- package/plugins/memory/templates/testing.yaml +121 -0
- package/plugins/skills/README.en.md +47 -0
- package/plugins/skills/README.md +104 -0
- package/plugins/skills/benchmark-executor/README.md +93 -0
- package/plugins/skills/benchmark-executor/SKILL.md +647 -0
- package/plugins/skills/benchmark-generator/SKILL.md +349 -0
- package/plugins/skills/delivery-stage/SKILL.md +203 -0
- package/plugins/skills/design-stage/SKILL.md +216 -0
- package/plugins/skills/evolution-process/SKILL.md +291 -0
- package/plugins/skills/exploration-phase/SKILL.md +133 -0
- package/plugins/skills/implementation-stage/SKILL.md +179 -0
- package/plugins/skills/layer1-validation/SKILL.md +79 -0
- package/plugins/skills/pending-dashboard/SKILL.md +109 -0
- package/plugins/skills/project-evolution/SKILL.md +847 -0
- package/plugins/skills/requirement-stage/SKILL.md +183 -0
- package/plugins/skills/skill-forge/SKILL.md +223 -0
- package/plugins/skills/skill-forge/references/description-guide.md +92 -0
- package/plugins/skills/skill-forge/references/quality-rubric.md +104 -0
- package/plugins/skills/skill-forge/references/skill-template.md +106 -0
- package/plugins/skills/startup-guard/SKILL.md +38 -0
- package/plugins/skills/testing-stage/SKILL.md +195 -0
- package/scripts/cli/global-init.js +288 -0
- package/scripts/cli/global.js +324 -0
- package/scripts/cli/index.js +55 -0
- package/scripts/cli/init.js +382 -0
- package/scripts/cli/list.js +69 -0
- package/scripts/cli/org.js +340 -0
- package/scripts/cli/update.js +44 -0
- package/scripts/config/commands.config.js +145 -0
- package/scripts/config/hooks.config.js +197 -0
- package/scripts/evolution/evolution-router.js +273 -0
- package/scripts/evolution/evolution-signal-collector.js +307 -0
- package/scripts/evolution/knowledge-loader.js +346 -0
- package/scripts/evolution/marketplace.js +317 -0
- package/scripts/evolution/version-manager.js +371 -0
- package/scripts/install/agents.js +106 -0
- package/scripts/install/commands.js +133 -0
- package/scripts/install/constants.js +424 -0
- package/scripts/install/hook-logger.js +536 -0
- package/scripts/install/hooks.js +110 -0
- package/scripts/install/index.js +39 -0
- package/scripts/install/skills.js +95 -0
- package/scripts/postinstall.js +25 -0
- package/scripts/state.js +376 -0
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
# AI 组件分析器
|
|
2
|
+
|
|
3
|
+
## 功能
|
|
4
|
+
|
|
5
|
+
检测项目中的 AI/ML 组件,包括模型训练、LLM 应用、Agent 框架、向量存储等。对于配置规则中未定义的组件,使用 AI 进行智能识别。
|
|
6
|
+
|
|
7
|
+
## 触发条件
|
|
8
|
+
|
|
9
|
+
- 检测到 AI 相关依赖但未在规则中定义
|
|
10
|
+
- 需要识别 RAG 架构组合
|
|
11
|
+
- 需要判断项目是否需要效果评测
|
|
12
|
+
|
|
13
|
+
## 输入
|
|
14
|
+
|
|
15
|
+
```json
|
|
16
|
+
{
|
|
17
|
+
"projectDir": "项目根目录",
|
|
18
|
+
"dependencies": ["依赖列表"],
|
|
19
|
+
"codeFiles": [
|
|
20
|
+
{
|
|
21
|
+
"path": "文件路径",
|
|
22
|
+
"patterns": ["检测到的代码模式"]
|
|
23
|
+
}
|
|
24
|
+
],
|
|
25
|
+
"directoryStructure": "目录结构概览",
|
|
26
|
+
"unknownDeps": ["未识别的 AI 相关依赖"]
|
|
27
|
+
}
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## 输出
|
|
31
|
+
|
|
32
|
+
```json
|
|
33
|
+
{
|
|
34
|
+
"components": [
|
|
35
|
+
{
|
|
36
|
+
"name": "组件名称",
|
|
37
|
+
"type": "model-training/inference-service/llm-application/agent-framework/vector-store/rag-application/evaluation",
|
|
38
|
+
"needsEvaluation": true,
|
|
39
|
+
"confidence": "high/medium/low",
|
|
40
|
+
"evidence": "判断依据"
|
|
41
|
+
}
|
|
42
|
+
],
|
|
43
|
+
"ragDetected": {
|
|
44
|
+
"hasVectorStore": true,
|
|
45
|
+
"hasLLM": true,
|
|
46
|
+
"isRAG": true
|
|
47
|
+
},
|
|
48
|
+
"evaluationRequired": true,
|
|
49
|
+
"suggestions": [
|
|
50
|
+
{
|
|
51
|
+
"component": "组件名",
|
|
52
|
+
"recommendation": "建议"
|
|
53
|
+
}
|
|
54
|
+
]
|
|
55
|
+
}
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## 执行步骤
|
|
59
|
+
|
|
60
|
+
1. **分析依赖**
|
|
61
|
+
- 识别已知的 AI/ML 框架
|
|
62
|
+
- 分析未知的 AI 相关依赖名称
|
|
63
|
+
|
|
64
|
+
2. **代码模式分析**
|
|
65
|
+
- 扫描代码中的 AI 相关模式:
|
|
66
|
+
- `import torch` / `from transformers import`
|
|
67
|
+
- `ChatOpenAI` / `Anthropic` / `LangChain`
|
|
68
|
+
- `FAISS` / `ChromaDB` / `Pinecone`
|
|
69
|
+
- 识别 Agent 模式:`Agent`、`Tool`、`Chain`
|
|
70
|
+
|
|
71
|
+
3. **RAG 架构识别**
|
|
72
|
+
- 检测向量存储依赖
|
|
73
|
+
- 检测 LLM 依赖
|
|
74
|
+
- 判断是否为 RAG 应用
|
|
75
|
+
|
|
76
|
+
4. **评测需求判断**
|
|
77
|
+
- 模型训练项目需要评测
|
|
78
|
+
- LLM 应用需要效果评测
|
|
79
|
+
- Agent 框架需要行为评测
|
|
80
|
+
|
|
81
|
+
5. **生成分析报告**
|
|
82
|
+
- 汇总所有 AI 组件
|
|
83
|
+
- 标注评测需求
|
|
84
|
+
- 提供配置更新建议
|
|
85
|
+
|
|
86
|
+
## AI 组件类型
|
|
87
|
+
|
|
88
|
+
| 类型 | 关键依赖 | 需要评测 |
|
|
89
|
+
|------|---------|---------|
|
|
90
|
+
| model-training | torch, tensorflow, jax | 是 |
|
|
91
|
+
| inference-service | vllm, tgi, triton | 是 |
|
|
92
|
+
| llm-application | langchain, openai, anthropic | 是 |
|
|
93
|
+
| agent-framework | crewai, autogen, metagpt | 是 |
|
|
94
|
+
| vector-store | faiss, chromadb, pinecone | 否 |
|
|
95
|
+
| rag-application | vector-store + llm | 是 |
|
|
96
|
+
| evaluation | mlflow, wandb, evaluate | 否 |
|
|
97
|
+
|
|
98
|
+
## 规则
|
|
99
|
+
|
|
100
|
+
1. **组合检测优先**
|
|
101
|
+
- RAG 应用需要同时有向量存储和 LLM
|
|
102
|
+
- Agent 需要有 LLM + 工具调用
|
|
103
|
+
|
|
104
|
+
2. **评测需求判断**
|
|
105
|
+
- 涉及模型输出的组件需要评测
|
|
106
|
+
- 纯向量存储不需要评测
|
|
107
|
+
|
|
108
|
+
3. **置信度评估**
|
|
109
|
+
- high: 配置规则匹配
|
|
110
|
+
- medium: 代码模式匹配
|
|
111
|
+
- low: 仅依赖名称推测
|
|
112
|
+
|
|
113
|
+
## 示例
|
|
114
|
+
|
|
115
|
+
### 输入
|
|
116
|
+
|
|
117
|
+
```json
|
|
118
|
+
{
|
|
119
|
+
"dependencies": ["langchain", "openai", "chromadb", "tiktoken"],
|
|
120
|
+
"codeFiles": [
|
|
121
|
+
{
|
|
122
|
+
"path": "src/rag.py",
|
|
123
|
+
"patterns": ["Chroma.from_documents", "ChatOpenAI"]
|
|
124
|
+
}
|
|
125
|
+
]
|
|
126
|
+
}
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### 输出
|
|
130
|
+
|
|
131
|
+
```json
|
|
132
|
+
{
|
|
133
|
+
"components": [
|
|
134
|
+
{
|
|
135
|
+
"name": "LLM应用",
|
|
136
|
+
"type": "llm-application",
|
|
137
|
+
"needsEvaluation": true,
|
|
138
|
+
"confidence": "high",
|
|
139
|
+
"evidence": "检测到 langchain 和 openai 依赖"
|
|
140
|
+
},
|
|
141
|
+
{
|
|
142
|
+
"name": "向量存储",
|
|
143
|
+
"type": "vector-store",
|
|
144
|
+
"needsEvaluation": false,
|
|
145
|
+
"confidence": "high",
|
|
146
|
+
"evidence": "检测到 chromadb 依赖"
|
|
147
|
+
},
|
|
148
|
+
{
|
|
149
|
+
"name": "RAG应用",
|
|
150
|
+
"type": "rag-application",
|
|
151
|
+
"needsEvaluation": true,
|
|
152
|
+
"confidence": "high",
|
|
153
|
+
"evidence": "同时检测到向量存储和 LLM,代码中有 RAG 模式"
|
|
154
|
+
}
|
|
155
|
+
],
|
|
156
|
+
"ragDetected": {
|
|
157
|
+
"hasVectorStore": true,
|
|
158
|
+
"hasLLM": true,
|
|
159
|
+
"isRAG": true
|
|
160
|
+
},
|
|
161
|
+
"evaluationRequired": true,
|
|
162
|
+
"suggestions": [
|
|
163
|
+
{
|
|
164
|
+
"component": "rag-application",
|
|
165
|
+
"recommendation": "建议创建评测数据集测试检索和生成质量"
|
|
166
|
+
}
|
|
167
|
+
]
|
|
168
|
+
}
|
|
169
|
+
```
|
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: data-validator
|
|
3
|
+
description: 当项目包含数据开发组件(ETL、数据管道、数据仓库等)时,分析数据质量、验证数据完整性、提供数据分析支持。用于数据开发的稳定、安全、效果保障。
|
|
4
|
+
type: ai
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## 定位
|
|
8
|
+
|
|
9
|
+
AI专用技能。为数据开发提供质量验证、分析和挖掘支持,借鉴AI评测方法论并将其应用到数据领域。
|
|
10
|
+
|
|
11
|
+
## 输入
|
|
12
|
+
|
|
13
|
+
- 必须输入:数据代码路径或项目目录、数据源配置
|
|
14
|
+
- 可选输入:数据质量要求、验证规则
|
|
15
|
+
|
|
16
|
+
## 输出
|
|
17
|
+
|
|
18
|
+
```json
|
|
19
|
+
{
|
|
20
|
+
"dataSystem": {
|
|
21
|
+
"type": "etl-pipeline",
|
|
22
|
+
"components": ["data-source", "transformation", "data-warehouse"],
|
|
23
|
+
"qualityRequirements": ["completeness", "accuracy", "timeliness"]
|
|
24
|
+
},
|
|
25
|
+
"validations": [
|
|
26
|
+
{
|
|
27
|
+
"dimension": "completeness",
|
|
28
|
+
"checks": ["null_check", "missing_value_check"],
|
|
29
|
+
"thresholds": {"null_rate": 0.05}
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
"dimension": "accuracy",
|
|
33
|
+
"checks": ["range_check", "format_check"],
|
|
34
|
+
"thresholds": {"error_rate": 0.01}
|
|
35
|
+
},
|
|
36
|
+
{
|
|
37
|
+
"dimension": "timeliness",
|
|
38
|
+
"checks": ["freshness_check", "delay_check"],
|
|
39
|
+
"thresholds": {"max_delay_hours": 24}
|
|
40
|
+
}
|
|
41
|
+
],
|
|
42
|
+
"analysisSupport": {
|
|
43
|
+
"dataProfile": true,
|
|
44
|
+
"anomalyDetection": true,
|
|
45
|
+
"trendAnalysis": false
|
|
46
|
+
},
|
|
47
|
+
"tools": {
|
|
48
|
+
"suggested": ["great-expectations", "pandas-profiling", "dbt"],
|
|
49
|
+
"reason": "这些工具支持数据质量验证和分析"
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## 执行步骤
|
|
55
|
+
|
|
56
|
+
### Step 1: 识别数据系统类型(确定性)
|
|
57
|
+
|
|
58
|
+
基于依赖和代码特征识别数据系统:
|
|
59
|
+
|
|
60
|
+
```
|
|
61
|
+
数据系统识别规则:
|
|
62
|
+
- dbt: 依赖dbt,代码中有 dbt run, dbt test
|
|
63
|
+
- airflow: 依赖airflow,代码中有 DAG, Task
|
|
64
|
+
- spark: 依赖pyspark,代码中有 SparkSession, DataFrame
|
|
65
|
+
- pandas: 依赖pandas,代码中有 read_csv, read_excel
|
|
66
|
+
- etl: 自定义ETL实现
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Step 2: Grep获取数据处理逻辑(高效)
|
|
70
|
+
|
|
71
|
+
搜索数据处理模式:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
# 数据读取
|
|
75
|
+
grep -r -n -A 5 "read_csv\|read_excel\|load\|fetch" --include="*.py" .
|
|
76
|
+
|
|
77
|
+
# 数据转换
|
|
78
|
+
grep -r -n -A 5 "transform\|aggregate\|join\|filter" --include="*.py" .
|
|
79
|
+
|
|
80
|
+
# 数据写入
|
|
81
|
+
grep -r -n -A 5 "to_csv\|write\|save\|export" --include="*.py" .
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### Step 3: 分析数据质量要求(模型)
|
|
85
|
+
|
|
86
|
+
基于代码上下文分析数据质量要求:
|
|
87
|
+
|
|
88
|
+
```
|
|
89
|
+
关注点:
|
|
90
|
+
- 数据源类型和可靠性
|
|
91
|
+
- 数据转换逻辑
|
|
92
|
+
- 数据质量检查点
|
|
93
|
+
- 错误处理机制
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Step 4: 设计验证方案(模型)
|
|
97
|
+
|
|
98
|
+
设计数据质量验证方案:
|
|
99
|
+
|
|
100
|
+
```
|
|
101
|
+
验证维度:
|
|
102
|
+
1. 完整性:空值、缺失记录、重复数据
|
|
103
|
+
2. 准确性:格式、范围、一致性
|
|
104
|
+
3. 时效性:更新频率、延迟
|
|
105
|
+
4. 唯一性:主键重复、外键关联
|
|
106
|
+
5. 合法性:业务规则、数据约束
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Step 5: 输出结果
|
|
110
|
+
|
|
111
|
+
汇总数据验证方案,包括验证规则、工具建议和实施计划。
|
|
112
|
+
|
|
113
|
+
## 数据质量维度
|
|
114
|
+
|
|
115
|
+
### 1. 完整性 (Completeness)
|
|
116
|
+
|
|
117
|
+
| 检查项 | 指标 | 方法 |
|
|
118
|
+
|--------|------|------|
|
|
119
|
+
| 空值检查 | null_rate | 统计null/空字符串比例 |
|
|
120
|
+
| 缺失记录 | missing_count | 对比预期记录数 |
|
|
121
|
+
| 重复数据 | duplicate_rate | 检查主键重复 |
|
|
122
|
+
|
|
123
|
+
### 2. 准确性 (Accuracy)
|
|
124
|
+
|
|
125
|
+
| 检查项 | 指标 | 方法 |
|
|
126
|
+
|--------|------|------|
|
|
127
|
+
| 格式检查 | format_error_rate | 正则表达式验证 |
|
|
128
|
+
| 范围检查 | range_error_rate | 数值范围验证 |
|
|
129
|
+
| 一致性检查 | consistency_rate | 跨表一致性验证 |
|
|
130
|
+
|
|
131
|
+
### 3. 时效性 (Timeliness)
|
|
132
|
+
|
|
133
|
+
| 检查项 | 指标 | 方法 |
|
|
134
|
+
|--------|------|------|
|
|
135
|
+
| 新鲜度 | data_freshness | 数据更新时间 |
|
|
136
|
+
| 延迟 | delay_hours | 数据延迟时间 |
|
|
137
|
+
| 调度 | schedule_adherence | 调度执行情况 |
|
|
138
|
+
|
|
139
|
+
### 4. 唯一性 (Uniqueness)
|
|
140
|
+
|
|
141
|
+
| 检查项 | 指标 | 方法 |
|
|
142
|
+
|--------|------|------|
|
|
143
|
+
| 主键重复 | pk_duplicate_rate | 主键唯一性 |
|
|
144
|
+
| 外键关联 | fk_violation_rate | 外键完整性 |
|
|
145
|
+
|
|
146
|
+
### 5. 合法性 (Validity)
|
|
147
|
+
|
|
148
|
+
| 检查项 | 指标 | 方法 |
|
|
149
|
+
|--------|------|------|
|
|
150
|
+
| 业务规则 | business_rule_violation | 业务规则验证 |
|
|
151
|
+
| 数据约束 | constraint_violation | 约束检查 |
|
|
152
|
+
|
|
153
|
+
## 数据分析支持
|
|
154
|
+
|
|
155
|
+
### 数据分析维度
|
|
156
|
+
|
|
157
|
+
| 维度 | 说明 | 工具 |
|
|
158
|
+
|------|------|------|
|
|
159
|
+
| **数据画像** | 数据的基本统计信息 | pandas-profiling, sweetviz |
|
|
160
|
+
| **异常检测** | 识别异常数据点 | isolation-forest, z-score |
|
|
161
|
+
| **趋势分析** | 数据随时间变化趋势 | pandas, matplotlib |
|
|
162
|
+
| **相关性分析** | 变量间相关关系 | pandas, seaborn |
|
|
163
|
+
| **分布分析** | 数据分布特征 | histogram, boxplot |
|
|
164
|
+
|
|
165
|
+
### 数据挖掘支持
|
|
166
|
+
|
|
167
|
+
| 任务 | 说明 | 适用场景 |
|
|
168
|
+
|------|------|----------|
|
|
169
|
+
| **特征工程** | 特征提取和转换 | 机器学习准备 |
|
|
170
|
+
| **数据标注** | 标注数据生成 | AI模型训练 |
|
|
171
|
+
| **数据增强** | 数据扩充 | 样本不足 |
|
|
172
|
+
| **数据清洗** | 脏数据处理 | 质量提升 |
|
|
173
|
+
|
|
174
|
+
## 数据验证工具
|
|
175
|
+
|
|
176
|
+
### 开源工具
|
|
177
|
+
|
|
178
|
+
| 工具 | 说明 | 特点 |
|
|
179
|
+
|------|------|------|
|
|
180
|
+
| **Great Expectations** | 数据质量验证 | 声明式验证规则 |
|
|
181
|
+
| **Pandas Profiling**(现已更名为 ydata-profiling) | 数据画像 | 自动生成报告 |
|
|
182
|
+
| **dbt** | 数据转换 | SQL式ETL |
|
|
183
|
+
| **Great Expectations** | 数据测试 | 集成测试 |
|
|
184
|
+
| **Soda** | 数据监控 | 主动监控 |
|
|
185
|
+
|
|
186
|
+
### 验证规则模板
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
# 完整性验证
|
|
190
|
+
expect_column_values_to_not_be_null("email")
|
|
191
|
+
expect_column_value_lengths_to_be_between("phone", 10, 15)
|
|
192
|
+
expect_table_row_count_to_be_between("orders", 1000, 1000000)
|
|
193
|
+
|
|
194
|
+
# 准确性验证
|
|
195
|
+
expect_column_values_to_match_regex("email", ".*@.*\\..*")
|
|
196
|
+
expect_column_values_to_be_between("price", 0, 10000)
|
|
197
|
+
expect_column_values_to_be_in_set("status", ["active", "inactive"])
|
|
198
|
+
|
|
199
|
+
# 唯一性验证
|
|
200
|
+
expect_column_values_to_be_unique("user_id")
|
|
201
|
+
expect_compound_columns_to_be_unique(["order_id", "line_item_id"])
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
## 调用时机
|
|
205
|
+
|
|
206
|
+
- 检测到数据系统组件时
|
|
207
|
+
- 数据管道开发阶段
|
|
208
|
+
- 数据质量保障需求
|
|
209
|
+
- 数据分析需求
|
|
210
|
+
|
|
211
|
+
## 示例
|
|
212
|
+
|
|
213
|
+
**输入**:
|
|
214
|
+
```
|
|
215
|
+
项目目录:/project
|
|
216
|
+
数据系统:dbt + snowflake
|
|
217
|
+
数据源:用户行为数据
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
**Grep搜索**:
|
|
221
|
+
```
|
|
222
|
+
找到文件:models/users.sql, tests/schema_test.yml
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
**代码上下文**:
|
|
226
|
+
```sql
|
|
227
|
+
-- models/users.sql
|
|
228
|
+
SELECT
|
|
229
|
+
user_id,
|
|
230
|
+
email,
|
|
231
|
+
created_at,
|
|
232
|
+
status
|
|
233
|
+
FROM raw_users
|
|
234
|
+
WHERE status = 'active'
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
**输出**:
|
|
238
|
+
```json
|
|
239
|
+
{
|
|
240
|
+
"dataSystem": {
|
|
241
|
+
"type": "dbt-pipeline",
|
|
242
|
+
"components": ["raw_users", "users_model"],
|
|
243
|
+
"qualityRequirements": ["completeness", "accuracy"]
|
|
244
|
+
},
|
|
245
|
+
"validations": [
|
|
246
|
+
{
|
|
247
|
+
"dimension": "completeness",
|
|
248
|
+
"checks": ["not_null", "unique"],
|
|
249
|
+
"fields": ["user_id", "email"],
|
|
250
|
+
"thresholds": {"null_rate": 0}
|
|
251
|
+
},
|
|
252
|
+
{
|
|
253
|
+
"dimension": "accuracy",
|
|
254
|
+
"checks": ["valid_email", "valid_status"],
|
|
255
|
+
"fields": ["email", "status"],
|
|
256
|
+
"thresholds": {"error_rate": 0.01}
|
|
257
|
+
}
|
|
258
|
+
],
|
|
259
|
+
"tools": {
|
|
260
|
+
"suggested": ["dbt-utils", "great-expectations"],
|
|
261
|
+
"reason": "与dbt集成良好"
|
|
262
|
+
}
|
|
263
|
+
}
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
## 与AI评测的类比
|
|
267
|
+
|
|
268
|
+
将AI评测方法论应用到数据开发:
|
|
269
|
+
|
|
270
|
+
| AI评测 | 数据开发 | 说明 |
|
|
271
|
+
|--------|----------|------|
|
|
272
|
+
| 任务完成率 | 数据完整率 | 数据是否完整 |
|
|
273
|
+
| 工具使用正确性 | 转换逻辑正确性 | ETL逻辑是否正确 |
|
|
274
|
+
| 响应质量 | 数据准确性 | 数据是否准确 |
|
|
275
|
+
| 基础设施噪声 | 数据源波动 | 源数据质量问题 |
|
|
276
|
+
| AI-resistant评估 | 数据一致性验证 | 防止数据泄露 |
|