@chongyan/autospec 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (243) hide show
  1. package/LICENSE +21 -0
  2. package/README.en.md +472 -0
  3. package/README.md +476 -0
  4. package/bin/autospec.js +3 -0
  5. package/knowledge/README.md +144 -0
  6. package/knowledge/checklists/code.md +182 -0
  7. package/knowledge/checklists/design.md +196 -0
  8. package/knowledge/checklists/release.md +70 -0
  9. package/knowledge/checklists/requirement.md +169 -0
  10. package/knowledge/checklists/test.md +46 -0
  11. package/knowledge/config/README.en.md +44 -0
  12. package/knowledge/config/README.md +44 -0
  13. package/knowledge/config/role-composition.yaml +98 -0
  14. package/knowledge/config/role-extensions.yaml +140 -0
  15. package/knowledge/config/skill-compositions.yaml +142 -0
  16. package/knowledge/config/team-stage.yaml +95 -0
  17. package/knowledge/config/team-tasks.yaml +139 -0
  18. package/knowledge/config/team-triggers.yaml +198 -0
  19. package/knowledge/config/validation-patterns.yaml +137 -0
  20. package/knowledge/domain/README.md +115 -0
  21. package/knowledge/domain/flows/README.md +194 -0
  22. package/knowledge/domain/glossary.md +143 -0
  23. package/knowledge/domain/rules.md +138 -0
  24. package/knowledge/environment/README.en.md +36 -0
  25. package/knowledge/environment/README.md +87 -0
  26. package/knowledge/environment/component-knowledge.md +316 -0
  27. package/knowledge/environment/detection-patterns.yaml +502 -0
  28. package/knowledge/environment/middleware-knowledge.md +237 -0
  29. package/knowledge/environment/template-registry.md +321 -0
  30. package/knowledge/guides/domain-driven-design.md +345 -0
  31. package/knowledge/guides/knowledge-management.md +369 -0
  32. package/knowledge/guides/requirement-engineering.md +329 -0
  33. package/knowledge/guides/stages/ai-effect-evaluator.md +93 -0
  34. package/knowledge/guides/stages/code-implementer.md +205 -0
  35. package/knowledge/guides/stages/code-reviewer.md +111 -0
  36. package/knowledge/guides/stages/consistency-checker.md +177 -0
  37. package/knowledge/guides/stages/design-planner.md +401 -0
  38. package/knowledge/guides/stages/design-reviewer.md +83 -0
  39. package/knowledge/guides/stages/integration-test-runner.md +105 -0
  40. package/knowledge/guides/stages/release-checker.md +205 -0
  41. package/knowledge/guides/stages/requirement-analyzer.md +195 -0
  42. package/knowledge/guides/stages/requirement-reviewer.md +83 -0
  43. package/knowledge/guides/stages/security-reviewer.md +89 -0
  44. package/knowledge/guides/stages/test-context-analyzer.md +250 -0
  45. package/knowledge/guides/stages/test-generator.md +241 -0
  46. package/knowledge/guides/stages/test-planner.md +183 -0
  47. package/knowledge/guides/stages/test-reviewer.md +76 -0
  48. package/knowledge/guides/stages/unit-test-runner.md +83 -0
  49. package/knowledge/guides/support/ai-agent-analyzer.md +362 -0
  50. package/knowledge/guides/support/ai-anomaly-analyzer.md +213 -0
  51. package/knowledge/guides/support/ai-artifact-evaluator.md +192 -0
  52. package/knowledge/guides/support/ai-capability-analyzer.md +193 -0
  53. package/knowledge/guides/support/ai-component-analyzer.md +169 -0
  54. package/knowledge/guides/support/ai-data-validator.md +276 -0
  55. package/knowledge/guides/support/ai-evaluation-planner.md +374 -0
  56. package/knowledge/guides/support/ai-path-evaluator.md +274 -0
  57. package/knowledge/guides/support/ai-pipeline-evaluator.md +219 -0
  58. package/knowledge/guides/support/ai-rag-analyzer.md +339 -0
  59. package/knowledge/guides/support/ai-task-assessor.md +418 -0
  60. package/knowledge/guides/support/ai-test-diagnostics.md +133 -0
  61. package/knowledge/guides/support/complexity-assessor.md +268 -0
  62. package/knowledge/guides/support/component-discovery.md +183 -0
  63. package/knowledge/guides/support/environment-scanner.md +207 -0
  64. package/knowledge/guides/support/environment-validator.md +207 -0
  65. package/knowledge/guides/support/knowledge-generator.md +234 -0
  66. package/knowledge/guides/support/methodology-extractor.md +55 -0
  67. package/knowledge/guides/support/pipeline-protocol.md +438 -0
  68. package/knowledge/guides/support/practice-logger.md +359 -0
  69. package/knowledge/guides/support/scope-inference.md +174 -0
  70. package/knowledge/guides/support/skill-distiller.md +91 -0
  71. package/knowledge/guides/support/skill-updater.md +45 -0
  72. package/knowledge/guides/support/skill-validator.md +72 -0
  73. package/knowledge/guides/support/team-orchestrator.md +323 -0
  74. package/knowledge/guides/support/tech-stack-analyzer.md +139 -0
  75. package/knowledge/guides/support/test-runner.md +254 -0
  76. package/knowledge/guides/system-design.md +352 -0
  77. package/knowledge/organization/ai-native-team.md +318 -0
  78. package/knowledge/organization/team-metrics.md +228 -0
  79. package/knowledge/principles/constitution.md +134 -0
  80. package/knowledge/principles/core-principles.md +368 -0
  81. package/knowledge/principles/design-philosophy.md +877 -0
  82. package/knowledge/principles/evolution.md +553 -0
  83. package/knowledge/process/01-requirement.md +113 -0
  84. package/knowledge/process/02-design.md +123 -0
  85. package/knowledge/process/03-implementation.md +90 -0
  86. package/knowledge/process/04-review.md +80 -0
  87. package/knowledge/process/05-testing.md +90 -0
  88. package/knowledge/process/06-delivery.md +88 -0
  89. package/knowledge/process/README.en.md +38 -0
  90. package/knowledge/process/README.md +48 -0
  91. package/knowledge/process/ai-sdlc.md +475 -0
  92. package/knowledge/process/overview.md +319 -0
  93. package/knowledge/standards/code-review.md +876 -0
  94. package/knowledge/standards/coding-style.md +940 -0
  95. package/knowledge/standards/data-consistency.md +1085 -0
  96. package/knowledge/standards/document-versioning.md +210 -0
  97. package/knowledge/standards/risk-detection.md +186 -0
  98. package/knowledge/templates/ai-evaluation.md +150 -0
  99. package/knowledge/templates/api-design.md +117 -0
  100. package/knowledge/templates/database-design.md +132 -0
  101. package/knowledge/templates/domain-driven-design.md +321 -0
  102. package/knowledge/templates/product-proposal.md +201 -0
  103. package/knowledge/templates/system-design.md +227 -0
  104. package/knowledge/templates/task-breakdown.md +107 -0
  105. package/knowledge/templates/test-case.md +170 -0
  106. package/package.json +53 -0
  107. package/plugins/.claude-plugin/plugin.json +134 -0
  108. package/plugins/agents/roles/ai-engineer.md +129 -0
  109. package/plugins/agents/roles/backend-engineer.md +165 -0
  110. package/plugins/agents/roles/ceo.md +94 -0
  111. package/plugins/agents/roles/data-engineer.md +135 -0
  112. package/plugins/agents/roles/devops-engineer.md +181 -0
  113. package/plugins/agents/roles/frontend-engineer.md +129 -0
  114. package/plugins/agents/roles/product-owner.md +98 -0
  115. package/plugins/agents/roles/quality-engineer.md +129 -0
  116. package/plugins/agents/roles/security-engineer.md +180 -0
  117. package/plugins/agents/roles/tech-lead.md +97 -0
  118. package/plugins/agents/support/blind-comparator.md +88 -0
  119. package/plugins/agents/support/consistency-checker.md +103 -0
  120. package/plugins/agents/support/failure-diagnostician.md +141 -0
  121. package/plugins/agents/support/independent-reviewer.md +80 -0
  122. package/plugins/agents/support/safety-auditor.md +121 -0
  123. package/plugins/agents/support/skill-benchmarker.md +86 -0
  124. package/plugins/agents/support/skill-forger.md +105 -0
  125. package/plugins/agents/support/stage-gate-evaluator.md +121 -0
  126. package/plugins/agents/support/test-coverage-reviewer.md +73 -0
  127. package/plugins/benchmarks/templates/README.md +44 -0
  128. package/plugins/benchmarks/templates/commands/explore-template.yaml +48 -0
  129. package/plugins/benchmarks/templates/pipeline/agile-template.yaml +84 -0
  130. package/plugins/benchmarks/templates/pipeline/waterfall-template.yaml +106 -0
  131. package/plugins/benchmarks/templates/skills/requirement-analyzer-template.yaml +48 -0
  132. package/plugins/commands/README.en.md +96 -0
  133. package/plugins/commands/README.md +96 -0
  134. package/plugins/commands/apply.md +191 -0
  135. package/plugins/commands/archive.md +76 -0
  136. package/plugins/commands/env-export.md +79 -0
  137. package/plugins/commands/env-sync.md +640 -0
  138. package/plugins/commands/env-template.md +223 -0
  139. package/plugins/commands/env-update.md +264 -0
  140. package/plugins/commands/env-validate.md +176 -0
  141. package/plugins/commands/env.md +79 -0
  142. package/plugins/commands/explore.md +76 -0
  143. package/plugins/commands/field-evolve.md +536 -0
  144. package/plugins/commands/memory.md +249 -0
  145. package/plugins/commands/project-evolve.md +821 -0
  146. package/plugins/commands/propose.md +93 -0
  147. package/plugins/commands/review.md +140 -0
  148. package/plugins/commands/run.md +224 -0
  149. package/plugins/commands/status.md +62 -0
  150. package/plugins/commands/validate.md +108 -0
  151. package/plugins/hooks/README.en.md +56 -0
  152. package/plugins/hooks/README.md +56 -0
  153. package/plugins/hooks/ai-project-guard.js +329 -0
  154. package/plugins/hooks/artifact-evaluation-hook.js +237 -0
  155. package/plugins/hooks/constitution-guard.js +211 -0
  156. package/plugins/hooks/environment-autocommit.js +264 -0
  157. package/plugins/hooks/environment-manager.js +778 -0
  158. package/plugins/hooks/execution-tracker.js +354 -0
  159. package/plugins/hooks/frozen-zone-guard.js +140 -0
  160. package/plugins/hooks/layer1-validator.js +423 -0
  161. package/plugins/hooks/lib/artifact-evaluator.js +414 -0
  162. package/plugins/hooks/lib/benchmarks/change-detector.js +390 -0
  163. package/plugins/hooks/lib/benchmarks/evaluator.js +605 -0
  164. package/plugins/hooks/lib/benchmarks/integration-example.js +169 -0
  165. package/plugins/hooks/lib/data-and-ai-detector.js +275 -0
  166. package/plugins/hooks/lib/detection-pattern-loader.js +865 -0
  167. package/plugins/hooks/lib/directory-discovery.js +395 -0
  168. package/plugins/hooks/lib/environment-config-loader.js +341 -0
  169. package/plugins/hooks/lib/environment-detector.js +553 -0
  170. package/plugins/hooks/lib/environment-evolver.js +564 -0
  171. package/plugins/hooks/lib/environment-registry.js +813 -0
  172. package/plugins/hooks/lib/execution-path.js +427 -0
  173. package/plugins/hooks/lib/hook-error-recorder.js +245 -0
  174. package/plugins/hooks/lib/hook-logger.js +538 -0
  175. package/plugins/hooks/lib/hook-runner.js +97 -0
  176. package/plugins/hooks/lib/hook-runner.sh +44 -0
  177. package/plugins/hooks/lib/hook-state-manager.js +480 -0
  178. package/plugins/hooks/lib/memory-extractor.js +377 -0
  179. package/plugins/hooks/lib/memory-manager.js +673 -0
  180. package/plugins/hooks/lib/metrics-analyzer.js +489 -0
  181. package/plugins/hooks/lib/project-evolution/auto-fixer.js +511 -0
  182. package/plugins/hooks/lib/project-evolution/memory-manager.js +346 -0
  183. package/plugins/hooks/lib/project-evolution/pattern-detector.js +476 -0
  184. package/plugins/hooks/lib/project-evolution/semantic-indexer.js +480 -0
  185. package/plugins/hooks/lib/project-structure-detector.js +326 -0
  186. package/plugins/hooks/lib/rollback-tracker.js +346 -0
  187. package/plugins/hooks/lib/source-code-scanner.js +596 -0
  188. package/plugins/hooks/lib/technology-stack-detector.js +374 -0
  189. package/plugins/hooks/lib/test-failure-analyzer.js +375 -0
  190. package/plugins/hooks/lib/test-failure-fixer.js +268 -0
  191. package/plugins/hooks/lib/trace-context.js +277 -0
  192. package/plugins/hooks/lib/validation-patterns.js +415 -0
  193. package/plugins/hooks/memory-sync.js +171 -0
  194. package/plugins/hooks/pipeline-observer.js +413 -0
  195. package/plugins/hooks/scope-sentinel.js +204 -0
  196. package/plugins/hooks/trace-initialization.js +169 -0
  197. package/plugins/memory/templates/code-quality.yaml +149 -0
  198. package/plugins/memory/templates/multi-system.yaml +155 -0
  199. package/plugins/memory/templates/team-habits.yaml +119 -0
  200. package/plugins/memory/templates/testing.yaml +121 -0
  201. package/plugins/skills/README.en.md +47 -0
  202. package/plugins/skills/README.md +104 -0
  203. package/plugins/skills/benchmark-executor/README.md +93 -0
  204. package/plugins/skills/benchmark-executor/SKILL.md +647 -0
  205. package/plugins/skills/benchmark-generator/SKILL.md +349 -0
  206. package/plugins/skills/delivery-stage/SKILL.md +203 -0
  207. package/plugins/skills/design-stage/SKILL.md +216 -0
  208. package/plugins/skills/evolution-process/SKILL.md +291 -0
  209. package/plugins/skills/exploration-phase/SKILL.md +133 -0
  210. package/plugins/skills/implementation-stage/SKILL.md +179 -0
  211. package/plugins/skills/layer1-validation/SKILL.md +79 -0
  212. package/plugins/skills/pending-dashboard/SKILL.md +109 -0
  213. package/plugins/skills/project-evolution/SKILL.md +847 -0
  214. package/plugins/skills/requirement-stage/SKILL.md +183 -0
  215. package/plugins/skills/skill-forge/SKILL.md +223 -0
  216. package/plugins/skills/skill-forge/references/description-guide.md +92 -0
  217. package/plugins/skills/skill-forge/references/quality-rubric.md +104 -0
  218. package/plugins/skills/skill-forge/references/skill-template.md +106 -0
  219. package/plugins/skills/startup-guard/SKILL.md +38 -0
  220. package/plugins/skills/testing-stage/SKILL.md +195 -0
  221. package/scripts/cli/global-init.js +288 -0
  222. package/scripts/cli/global.js +324 -0
  223. package/scripts/cli/index.js +55 -0
  224. package/scripts/cli/init.js +382 -0
  225. package/scripts/cli/list.js +69 -0
  226. package/scripts/cli/org.js +340 -0
  227. package/scripts/cli/update.js +44 -0
  228. package/scripts/config/commands.config.js +145 -0
  229. package/scripts/config/hooks.config.js +197 -0
  230. package/scripts/evolution/evolution-router.js +273 -0
  231. package/scripts/evolution/evolution-signal-collector.js +307 -0
  232. package/scripts/evolution/knowledge-loader.js +346 -0
  233. package/scripts/evolution/marketplace.js +317 -0
  234. package/scripts/evolution/version-manager.js +371 -0
  235. package/scripts/install/agents.js +106 -0
  236. package/scripts/install/commands.js +133 -0
  237. package/scripts/install/constants.js +424 -0
  238. package/scripts/install/hook-logger.js +536 -0
  239. package/scripts/install/hooks.js +110 -0
  240. package/scripts/install/index.js +39 -0
  241. package/scripts/install/skills.js +95 -0
  242. package/scripts/postinstall.js +25 -0
  243. package/scripts/state.js +376 -0
@@ -0,0 +1,169 @@
1
+ # AI 组件分析器
2
+
3
+ ## 功能
4
+
5
+ 检测项目中的 AI/ML 组件,包括模型训练、LLM 应用、Agent 框架、向量存储等。对于配置规则中未定义的组件,使用 AI 进行智能识别。
6
+
7
+ ## 触发条件
8
+
9
+ - 检测到 AI 相关依赖但未在规则中定义
10
+ - 需要识别 RAG 架构组合
11
+ - 需要判断项目是否需要效果评测
12
+
13
+ ## 输入
14
+
15
+ ```json
16
+ {
17
+ "projectDir": "项目根目录",
18
+ "dependencies": ["依赖列表"],
19
+ "codeFiles": [
20
+ {
21
+ "path": "文件路径",
22
+ "patterns": ["检测到的代码模式"]
23
+ }
24
+ ],
25
+ "directoryStructure": "目录结构概览",
26
+ "unknownDeps": ["未识别的 AI 相关依赖"]
27
+ }
28
+ ```
29
+
30
+ ## 输出
31
+
32
+ ```json
33
+ {
34
+ "components": [
35
+ {
36
+ "name": "组件名称",
37
+ "type": "model-training/inference-service/llm-application/agent-framework/vector-store/rag-application/evaluation",
38
+ "needsEvaluation": true,
39
+ "confidence": "high/medium/low",
40
+ "evidence": "判断依据"
41
+ }
42
+ ],
43
+ "ragDetected": {
44
+ "hasVectorStore": true,
45
+ "hasLLM": true,
46
+ "isRAG": true
47
+ },
48
+ "evaluationRequired": true,
49
+ "suggestions": [
50
+ {
51
+ "component": "组件名",
52
+ "recommendation": "建议"
53
+ }
54
+ ]
55
+ }
56
+ ```
57
+
58
+ ## 执行步骤
59
+
60
+ 1. **分析依赖**
61
+ - 识别已知的 AI/ML 框架
62
+ - 分析未知的 AI 相关依赖名称
63
+
64
+ 2. **代码模式分析**
65
+ - 扫描代码中的 AI 相关模式:
66
+ - `import torch` / `from transformers import`
67
+ - `ChatOpenAI` / `Anthropic` / `LangChain`
68
+ - `FAISS` / `ChromaDB` / `Pinecone`
69
+ - 识别 Agent 模式:`Agent`、`Tool`、`Chain`
70
+
71
+ 3. **RAG 架构识别**
72
+ - 检测向量存储依赖
73
+ - 检测 LLM 依赖
74
+ - 判断是否为 RAG 应用
75
+
76
+ 4. **评测需求判断**
77
+ - 模型训练项目需要评测
78
+ - LLM 应用需要效果评测
79
+ - Agent 框架需要行为评测
80
+
81
+ 5. **生成分析报告**
82
+ - 汇总所有 AI 组件
83
+ - 标注评测需求
84
+ - 提供配置更新建议
85
+
86
+ ## AI 组件类型
87
+
88
+ | 类型 | 关键依赖 | 需要评测 |
89
+ |------|---------|---------|
90
+ | model-training | torch, tensorflow, jax | 是 |
91
+ | inference-service | vllm, tgi, triton | 是 |
92
+ | llm-application | langchain, openai, anthropic | 是 |
93
+ | agent-framework | crewai, autogen, metagpt | 是 |
94
+ | vector-store | faiss, chromadb, pinecone | 否 |
95
+ | rag-application | vector-store + llm | 是 |
96
+ | evaluation | mlflow, wandb, evaluate | 否 |
97
+
98
+ ## 规则
99
+
100
+ 1. **组合检测优先**
101
+ - RAG 应用需要同时有向量存储和 LLM
102
+ - Agent 需要有 LLM + 工具调用
103
+
104
+ 2. **评测需求判断**
105
+ - 涉及模型输出的组件需要评测
106
+ - 纯向量存储不需要评测
107
+
108
+ 3. **置信度评估**
109
+ - high: 配置规则匹配
110
+ - medium: 代码模式匹配
111
+ - low: 仅依赖名称推测
112
+
113
+ ## 示例
114
+
115
+ ### 输入
116
+
117
+ ```json
118
+ {
119
+ "dependencies": ["langchain", "openai", "chromadb", "tiktoken"],
120
+ "codeFiles": [
121
+ {
122
+ "path": "src/rag.py",
123
+ "patterns": ["Chroma.from_documents", "ChatOpenAI"]
124
+ }
125
+ ]
126
+ }
127
+ ```
128
+
129
+ ### 输出
130
+
131
+ ```json
132
+ {
133
+ "components": [
134
+ {
135
+ "name": "LLM应用",
136
+ "type": "llm-application",
137
+ "needsEvaluation": true,
138
+ "confidence": "high",
139
+ "evidence": "检测到 langchain 和 openai 依赖"
140
+ },
141
+ {
142
+ "name": "向量存储",
143
+ "type": "vector-store",
144
+ "needsEvaluation": false,
145
+ "confidence": "high",
146
+ "evidence": "检测到 chromadb 依赖"
147
+ },
148
+ {
149
+ "name": "RAG应用",
150
+ "type": "rag-application",
151
+ "needsEvaluation": true,
152
+ "confidence": "high",
153
+ "evidence": "同时检测到向量存储和 LLM,代码中有 RAG 模式"
154
+ }
155
+ ],
156
+ "ragDetected": {
157
+ "hasVectorStore": true,
158
+ "hasLLM": true,
159
+ "isRAG": true
160
+ },
161
+ "evaluationRequired": true,
162
+ "suggestions": [
163
+ {
164
+ "component": "RAG应用",
165
+ "recommendation": "建议创建评测数据集测试检索和生成质量"
166
+ }
167
+ ]
168
+ }
169
+ ```
@@ -0,0 +1,276 @@
1
+ ---
2
+ name: data-validator
3
+ description: 当项目包含数据开发组件(ETL、数据管道、数据仓库等)时,分析数据质量、验证数据完整性、提供数据分析支持。用于数据开发的稳定、安全、效果保障。
4
+ type: ai
5
+ ---
6
+
7
+ ## 定位
8
+
9
+ AI专用技能。为数据开发提供质量验证、分析和挖掘支持,将AI评测方法论借鉴并应用到数据领域。
10
+
11
+ ## 输入
12
+
13
+ - 必须输入:数据代码路径或项目目录、数据源配置
14
+ - 可选输入:数据质量要求、验证规则
15
+
16
+ ## 输出
17
+
18
+ ```json
19
+ {
20
+ "dataSystem": {
21
+ "type": "etl-pipeline",
22
+ "components": ["data-source", "transformation", "data-warehouse"],
23
+ "qualityRequirements": ["completeness", "accuracy", "timeliness"]
24
+ },
25
+ "validations": [
26
+ {
27
+ "dimension": "completeness",
28
+ "checks": ["null_check", "missing_value_check"],
29
+ "thresholds": {"null_rate": 0.05}
30
+ },
31
+ {
32
+ "dimension": "accuracy",
33
+ "checks": ["range_check", "format_check"],
34
+ "thresholds": {"error_rate": 0.01}
35
+ },
36
+ {
37
+ "dimension": "timeliness",
38
+ "checks": ["freshness_check", "delay_check"],
39
+ "thresholds": {"max_delay_hours": 24}
40
+ }
41
+ ],
42
+ "analysisSupport": {
43
+ "dataProfile": true,
44
+ "anomalyDetection": true,
45
+ "trendAnalysis": false
46
+ },
47
+ "tools": {
48
+ "suggested": ["great-expectations", "pandas-profiling", "dbt"],
49
+ "reason": "这些工具支持数据质量验证和分析"
50
+ }
51
+ }
52
+ ```
53
+
54
+ ## 执行步骤
55
+
56
+ ### Step 1: 识别数据系统类型(确定性)
57
+
58
+ 基于依赖和代码特征识别数据系统:
59
+
60
+ ```
61
+ 数据系统识别规则:
62
+ - dbt: 依赖dbt,代码中有 dbt run, dbt test
63
+ - airflow: 依赖airflow,代码中有 DAG, Task
64
+ - spark: 依赖pyspark,代码中有 SparkSession, DataFrame
65
+ - pandas: 依赖pandas,代码中有 read_csv, read_excel
66
+ - etl: 自定义ETL实现
67
+ ```
68
+
69
+ ### Step 2: Grep获取数据处理逻辑(高效)
70
+
71
+ 搜索数据处理模式:
72
+
73
+ ```bash
74
+ # 数据读取
75
+ grep -r -n -A 5 "read_csv\|read_excel\|load\|fetch" --include="*.py" .
76
+
77
+ # 数据转换
78
+ grep -r -n -A 5 "transform\|aggregate\|join\|filter" --include="*.py" .
79
+
80
+ # 数据写入
81
+ grep -r -n -A 5 "to_csv\|write\|save\|export" --include="*.py" .
82
+ ```
83
+
84
+ ### Step 3: 分析数据质量要求(模型)
85
+
86
+ 基于代码上下文分析数据质量要求:
87
+
88
+ ```
89
+ 关注点:
90
+ - 数据源类型和可靠性
91
+ - 数据转换逻辑
92
+ - 数据质量检查点
93
+ - 错误处理机制
94
+ ```
95
+
96
+ ### Step 4: 设计验证方案(模型)
97
+
98
+ 设计数据质量验证方案:
99
+
100
+ ```
101
+ 验证维度:
102
+ 1. 完整性:空值、缺失记录、重复数据
103
+ 2. 准确性:格式、范围、一致性
104
+ 3. 时效性:更新频率、延迟
105
+ 4. 唯一性:主键重复、外键关联
106
+ 5. 合法性:业务规则、数据约束
107
+ ```
108
+
109
+ ### Step 5: 输出结果
110
+
111
+ 汇总数据验证方案,包括验证规则、工具建议和实施计划。
112
+
113
+ ## 数据质量维度
114
+
115
+ ### 1. 完整性 (Completeness)
116
+
117
+ | 检查项 | 指标 | 方法 |
118
+ |--------|------|------|
119
+ | 空值检查 | null_rate | 统计null/空字符串比例 |
120
+ | 缺失记录 | missing_count | 对比预期记录数 |
121
+ | 重复数据 | duplicate_rate | 检查主键重复 |
122
+
123
+ ### 2. 准确性 (Accuracy)
124
+
125
+ | 检查项 | 指标 | 方法 |
126
+ |--------|------|------|
127
+ | 格式检查 | format_error_rate | 正则表达式验证 |
128
+ | 范围检查 | range_error_rate | 数值范围验证 |
129
+ | 一致性检查 | consistency_rate | 跨表一致性验证 |
130
+
131
+ ### 3. 时效性 (Timeliness)
132
+
133
+ | 检查项 | 指标 | 方法 |
134
+ |--------|------|------|
135
+ | 新鲜度 | data_freshness | 数据更新时间 |
136
+ | 延迟 | delay_hours | 数据延迟时间 |
137
+ | 调度 | schedule_adherence | 调度执行情况 |
138
+
139
+ ### 4. 唯一性 (Uniqueness)
140
+
141
+ | 检查项 | 指标 | 方法 |
142
+ |--------|------|------|
143
+ | 主键重复 | pk_duplicate_rate | 主键唯一性 |
144
+ | 外键关联 | fk_violation_rate | 外键完整性 |
145
+
146
+ ### 5. 合法性 (Validity)
147
+
148
+ | 检查项 | 指标 | 方法 |
149
+ |--------|------|------|
150
+ | 业务规则 | business_rule_violation | 业务规则验证 |
151
+ | 数据约束 | constraint_violation | 约束检查 |
152
+
153
+ ## 数据分析支持
154
+
155
+ ### 数据分析维度
156
+
157
+ | 维度 | 说明 | 工具 |
158
+ |------|------|------|
159
+ | **数据画像** | 数据的基本统计信息 | pandas-profiling, sweetviz |
160
+ | **异常检测** | 识别异常数据点 | isolation-forest, z-score |
161
+ | **趋势分析** | 数据随时间变化趋势 | pandas, matplotlib |
162
+ | **相关性分析** | 变量间相关关系 | pandas, seaborn |
163
+ | **分布分析** | 数据分布特征 | histogram, boxplot |
164
+
165
+ ### 数据挖掘支持
166
+
167
+ | 任务 | 说明 | 适用场景 |
168
+ |------|------|----------|
169
+ | **特征工程** | 特征提取和转换 | 机器学习准备 |
170
+ | **数据标注** | 标注数据生成 | AI模型训练 |
171
+ | **数据增强** | 数据扩充 | 样本不足 |
172
+ | **数据清洗** | 脏数据处理 | 质量提升 |
173
+
174
+ ## 数据验证工具
175
+
176
+ ### 开源工具
177
+
178
+ | 工具 | 说明 | 特点 |
179
+ |------|------|------|
180
+ | **Great Expectations** | 数据质量验证 | 声明式验证规则 |
181
+ | **Pandas Profiling** | 数据画像 | 自动生成报告 |
182
+ | **dbt** | 数据转换 | SQL式ETL |
183
+ | **Great Expectations** | 数据测试 | 集成测试 |
184
+ | **Soda** | 数据监控 | 主动监控 |
185
+
186
+ ### 验证规则模板
187
+
188
+ ```python
189
+ # 完整性验证
190
+ expect_column_values_to_not_be_null("email")
191
+ expect_column_value_lengths_to_be_between("phone", 10, 15)
192
+ expect_table_row_count_to_be_between("orders", 1000, 1000000)
193
+
194
+ # 准确性验证
195
+ expect_column_values_to_match_regex("email", ".*@.*\\..*")
196
+ expect_column_values_to_be_between("price", 0, 10000)
197
+ expect_column_values_to_be_in_set("status", ["active", "inactive"])
198
+
199
+ # 唯一性验证
200
+ expect_column_values_to_be_unique("user_id")
201
+ expect_compound_columns_to_be_unique(["order_id", "line_item_id"])
202
+ ```
203
+
204
+ ## 调用时机
205
+
206
+ - 检测到数据系统组件时
207
+ - 数据管道开发阶段
208
+ - 数据质量保障需求
209
+ - 数据分析需求
210
+
211
+ ## 示例
212
+
213
+ **输入**:
214
+ ```
215
+ 项目目录:/project
216
+ 数据系统:dbt + snowflake
217
+ 数据源:用户行为数据
218
+ ```
219
+
220
+ **Grep搜索**:
221
+ ```
222
+ 找到文件:models/users.sql, tests/schema_test.yml
223
+ ```
224
+
225
+ **代码上下文**:
226
+ ```sql
227
+ -- models/users.sql
228
+ SELECT
229
+ user_id,
230
+ email,
231
+ created_at,
232
+ status
233
+ FROM raw_users
234
+ WHERE status = 'active'
235
+ ```
236
+
237
+ **输出**:
238
+ ```json
239
+ {
240
+ "dataSystem": {
241
+ "type": "dbt-pipeline",
242
+ "components": ["raw_users", "users_model"],
243
+ "qualityRequirements": ["completeness", "accuracy"]
244
+ },
245
+ "validations": [
246
+ {
247
+ "dimension": "completeness",
248
+ "checks": ["not_null", "unique"],
249
+ "fields": ["user_id", "email"],
250
+ "thresholds": {"null_rate": 0}
251
+ },
252
+ {
253
+ "dimension": "accuracy",
254
+ "checks": ["valid_email", "valid_status"],
255
+ "fields": ["email", "status"],
256
+ "thresholds": {"error_rate": 0.01}
257
+ }
258
+ ],
259
+ "tools": {
260
+ "suggested": ["dbt-utils", "great-expectations"],
261
+ "reason": "与dbt集成良好"
262
+ }
263
+ }
264
+ ```
265
+
266
+ ## 与AI评测的类比
267
+
268
+ 将AI评测方法论应用到数据开发:
269
+
270
+ | AI评测 | 数据开发 | 说明 |
271
+ |--------|----------|------|
272
+ | 任务完成率 | 数据完整率 | 数据是否完整 |
273
+ | 工具使用正确性 | 转换逻辑正确性 | ETL逻辑是否正确 |
274
+ | 响应质量 | 数据准确性 | 数据是否准确 |
275
+ | 基础设施噪声 | 数据源波动 | 源数据质量问题 |
276
+ | AI-resistant评估 | 数据一致性验证 | 防止数据泄露 |