code-abyss 1.6.16 → 1.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -6
- package/bin/install.js +59 -163
- package/bin/lib/ccline.js +82 -0
- package/bin/lib/utils.js +61 -0
- package/package.json +5 -2
- package/skills/SKILL.md +24 -16
- package/skills/domains/ai/SKILL.md +2 -2
- package/skills/domains/ai/prompt-and-eval.md +279 -0
- package/skills/domains/architecture/SKILL.md +2 -3
- package/skills/domains/architecture/security-arch.md +87 -0
- package/skills/domains/data-engineering/SKILL.md +188 -26
- package/skills/domains/development/SKILL.md +1 -4
- package/skills/domains/devops/SKILL.md +3 -5
- package/skills/domains/devops/performance.md +63 -0
- package/skills/domains/devops/testing.md +97 -0
- package/skills/domains/frontend-design/SKILL.md +12 -3
- package/skills/domains/frontend-design/claymorphism/SKILL.md +117 -0
- package/skills/domains/frontend-design/claymorphism/references/tokens.css +52 -0
- package/skills/domains/frontend-design/engineering.md +287 -0
- package/skills/domains/frontend-design/glassmorphism/SKILL.md +138 -0
- package/skills/domains/frontend-design/glassmorphism/references/tokens.css +32 -0
- package/skills/domains/frontend-design/liquid-glass/SKILL.md +135 -0
- package/skills/domains/frontend-design/liquid-glass/references/tokens.css +81 -0
- package/skills/domains/frontend-design/neubrutalism/SKILL.md +141 -0
- package/skills/domains/frontend-design/neubrutalism/references/tokens.css +44 -0
- package/skills/domains/infrastructure/SKILL.md +174 -34
- package/skills/domains/mobile/SKILL.md +211 -21
- package/skills/domains/orchestration/SKILL.md +1 -0
- package/skills/domains/security/SKILL.md +4 -6
- package/skills/domains/security/blue-team.md +57 -0
- package/skills/domains/security/red-team.md +54 -0
- package/skills/domains/security/threat-intel.md +50 -0
- package/skills/orchestration/multi-agent/SKILL.md +195 -46
- package/skills/run_skill.js +139 -0
- package/skills/tools/gen-docs/SKILL.md +6 -4
- package/skills/tools/gen-docs/scripts/doc_generator.js +363 -0
- package/skills/tools/lib/shared.js +98 -0
- package/skills/tools/verify-change/SKILL.md +8 -6
- package/skills/tools/verify-change/scripts/change_analyzer.js +289 -0
- package/skills/tools/verify-module/SKILL.md +6 -4
- package/skills/tools/verify-module/scripts/module_scanner.js +171 -0
- package/skills/tools/verify-quality/SKILL.md +5 -3
- package/skills/tools/verify-quality/scripts/quality_checker.js +337 -0
- package/skills/tools/verify-security/SKILL.md +7 -5
- package/skills/tools/verify-security/scripts/security_scanner.js +283 -0
- package/skills/__pycache__/run_skill.cpython-312.pyc +0 -0
- package/skills/domains/COVERAGE_PLAN.md +0 -232
- package/skills/domains/ai/model-evaluation.md +0 -790
- package/skills/domains/ai/prompt-engineering.md +0 -703
- package/skills/domains/architecture/compliance.md +0 -299
- package/skills/domains/architecture/data-security.md +0 -184
- package/skills/domains/data-engineering/data-pipeline.md +0 -762
- package/skills/domains/data-engineering/data-quality.md +0 -894
- package/skills/domains/data-engineering/stream-processing.md +0 -791
- package/skills/domains/development/dart.md +0 -963
- package/skills/domains/development/kotlin.md +0 -834
- package/skills/domains/development/php.md +0 -659
- package/skills/domains/development/swift.md +0 -755
- package/skills/domains/devops/e2e-testing.md +0 -914
- package/skills/domains/devops/performance-testing.md +0 -734
- package/skills/domains/devops/testing-strategy.md +0 -667
- package/skills/domains/frontend-design/build-tools.md +0 -743
- package/skills/domains/frontend-design/performance.md +0 -734
- package/skills/domains/frontend-design/testing.md +0 -699
- package/skills/domains/infrastructure/gitops.md +0 -735
- package/skills/domains/infrastructure/iac.md +0 -855
- package/skills/domains/infrastructure/kubernetes.md +0 -1018
- package/skills/domains/mobile/android-dev.md +0 -979
- package/skills/domains/mobile/cross-platform.md +0 -795
- package/skills/domains/mobile/ios-dev.md +0 -931
- package/skills/domains/security/secrets-management.md +0 -834
- package/skills/domains/security/supply-chain.md +0 -931
- package/skills/domains/security/threat-modeling.md +0 -828
- package/skills/run_skill.py +0 -153
- package/skills/tests/README.md +0 -225
- package/skills/tests/SUMMARY.md +0 -362
- package/skills/tests/__init__.py +0 -3
- package/skills/tests/__pycache__/test_change_analyzer.cpython-312.pyc +0 -0
- package/skills/tests/__pycache__/test_doc_generator.cpython-312.pyc +0 -0
- package/skills/tests/__pycache__/test_module_scanner.cpython-312.pyc +0 -0
- package/skills/tests/__pycache__/test_quality_checker.cpython-312.pyc +0 -0
- package/skills/tests/__pycache__/test_security_scanner.cpython-312.pyc +0 -0
- package/skills/tests/test_change_analyzer.py +0 -558
- package/skills/tests/test_doc_generator.py +0 -538
- package/skills/tests/test_module_scanner.py +0 -376
- package/skills/tests/test_quality_checker.py +0 -516
- package/skills/tests/test_security_scanner.py +0 -426
- package/skills/tools/gen-docs/scripts/__pycache__/doc_generator.cpython-312.pyc +0 -0
- package/skills/tools/gen-docs/scripts/doc_generator.py +0 -520
- package/skills/tools/verify-change/scripts/__pycache__/change_analyzer.cpython-312.pyc +0 -0
- package/skills/tools/verify-change/scripts/change_analyzer.py +0 -529
- package/skills/tools/verify-module/scripts/__pycache__/module_scanner.cpython-312.pyc +0 -0
- package/skills/tools/verify-module/scripts/module_scanner.py +0 -321
- package/skills/tools/verify-quality/scripts/__pycache__/quality_checker.cpython-312.pyc +0 -0
- package/skills/tools/verify-quality/scripts/quality_checker.py +0 -481
- package/skills/tools/verify-security/scripts/__pycache__/security_scanner.cpython-312.pyc +0 -0
- package/skills/tools/verify-security/scripts/security_scanner.py +0 -374
|
@@ -1,894 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: data-quality
|
|
3
|
-
description: 数据质量保障。Great Expectations、dbt、数据验证、数据测试、数据血缘、完整性检查。当用户提到数据质量、Great Expectations、dbt、数据验证、数据测试时使用。
|
|
4
|
-
---
|
|
5
|
-
|
|
6
|
-
# 🎯 数据质量秘典 · Data Quality
|
|
7
|
-
|
|
8
|
-
## 质量维度
|
|
9
|
-
|
|
10
|
-
```
|
|
11
|
-
完整性 → 准确性 → 一致性 → 及时性 → 有效性
|
|
12
|
-
│ │ │ │ │
|
|
13
|
-
└─ 非空 ─┴─ 范围 ─┴─ 关联 ─┴─ 新鲜度 ─┴─ 格式
|
|
14
|
-
```
|
|
15
|
-
|
|
16
|
-
## Great Expectations 基础
|
|
17
|
-
|
|
18
|
-
### 安装和初始化
|
|
19
|
-
|
|
20
|
-
```bash
|
|
21
|
-
# 安装
|
|
22
|
-
pip install great_expectations
|
|
23
|
-
|
|
24
|
-
# 初始化项目
|
|
25
|
-
great_expectations init
|
|
26
|
-
|
|
27
|
-
# 项目结构
|
|
28
|
-
great_expectations/
|
|
29
|
-
├── great_expectations.yml
|
|
30
|
-
├── expectations/
|
|
31
|
-
├── checkpoints/
|
|
32
|
-
├── plugins/
|
|
33
|
-
└── uncommitted/
|
|
34
|
-
```
|
|
35
|
-
|
|
36
|
-
### 创建 Data Context
|
|
37
|
-
|
|
38
|
-
```python
|
|
39
|
-
import great_expectations as gx
|
|
40
|
-
from great_expectations.data_context import FileDataContext
|
|
41
|
-
|
|
42
|
-
# 获取 Data Context
|
|
43
|
-
context = gx.get_context()
|
|
44
|
-
|
|
45
|
-
# 添加数据源
|
|
46
|
-
datasource = context.sources.add_pandas("my_datasource")
|
|
47
|
-
|
|
48
|
-
# 添加数据资产
|
|
49
|
-
data_asset = datasource.add_dataframe_asset(name="users_df")
|
|
50
|
-
|
|
51
|
-
# 构建批次请求
|
|
52
|
-
batch_request = data_asset.build_batch_request(dataframe=df)
|
|
53
|
-
```
|
|
54
|
-
|
|
55
|
-
### Expectations 定义
|
|
56
|
-
|
|
57
|
-
```python
|
|
58
|
-
import pandas as pd
|
|
59
|
-
import great_expectations as gx
|
|
60
|
-
|
|
61
|
-
# 创建 Validator
|
|
62
|
-
context = gx.get_context()
|
|
63
|
-
validator = context.sources.pandas_default.read_dataframe(df)
|
|
64
|
-
|
|
65
|
-
# 基础 Expectations
|
|
66
|
-
validator.expect_table_row_count_to_be_between(min_value=100, max_value=10000)
|
|
67
|
-
validator.expect_table_column_count_to_equal(value=5)
|
|
68
|
-
|
|
69
|
-
# 列存在性
|
|
70
|
-
validator.expect_column_to_exist(column="user_id")
|
|
71
|
-
validator.expect_column_to_exist(column="email")
|
|
72
|
-
|
|
73
|
-
# 非空检查
|
|
74
|
-
validator.expect_column_values_to_not_be_null(column="user_id")
|
|
75
|
-
validator.expect_column_values_to_not_be_null(column="email")
|
|
76
|
-
|
|
77
|
-
# 唯一性检查
|
|
78
|
-
validator.expect_column_values_to_be_unique(column="user_id")
|
|
79
|
-
validator.expect_column_values_to_be_unique(column="email")
|
|
80
|
-
|
|
81
|
-
# 值范围检查
|
|
82
|
-
validator.expect_column_values_to_be_between(
|
|
83
|
-
column="age",
|
|
84
|
-
min_value=0,
|
|
85
|
-
max_value=120
|
|
86
|
-
)
|
|
87
|
-
|
|
88
|
-
# 值集合检查
|
|
89
|
-
validator.expect_column_values_to_be_in_set(
|
|
90
|
-
column="status",
|
|
91
|
-
value_set=["active", "inactive", "pending"]
|
|
92
|
-
)
|
|
93
|
-
|
|
94
|
-
# 正则表达式检查
|
|
95
|
-
validator.expect_column_values_to_match_regex(
|
|
96
|
-
column="email",
|
|
97
|
-
regex=r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
|
|
98
|
-
)
|
|
99
|
-
|
|
100
|
-
# 类型检查
|
|
101
|
-
validator.expect_column_values_to_be_of_type(
|
|
102
|
-
column="age",
|
|
103
|
-
type_="int64"
|
|
104
|
-
)
|
|
105
|
-
|
|
106
|
-
# 保存 Expectation Suite
|
|
107
|
-
validator.save_expectation_suite(discard_failed_expectations=False)
|
|
108
|
-
```
|
|
109
|
-
|
|
110
|
-
### 自定义 Expectations
|
|
111
|
-
|
|
112
|
-
```python
|
|
113
|
-
from great_expectations.expectations.expectation import ColumnMapExpectation
|
|
114
|
-
from great_expectations.execution_engine import PandasExecutionEngine
|
|
115
|
-
|
|
116
|
-
class ExpectColumnValuesToBeValidPhoneNumber(ColumnMapExpectation):
|
|
117
|
-
"""期望列值为有效电话号码"""
|
|
118
|
-
|
|
119
|
-
map_metric = "column_values.match_phone_pattern"
|
|
120
|
-
|
|
121
|
-
@classmethod
|
|
122
|
-
def _atomic_prescriptive_template(cls, **kwargs):
|
|
123
|
-
return "values must be valid phone numbers"
|
|
124
|
-
|
|
125
|
-
@classmethod
|
|
126
|
-
def _prescriptive_template(cls, **kwargs):
|
|
127
|
-
return "At least $mostly_pct % of values in $column must be valid phone numbers"
|
|
128
|
-
|
|
129
|
-
# 注册自定义 Expectation
|
|
130
|
-
validator.expect_column_values_to_be_valid_phone_number(
|
|
131
|
-
column="phone",
|
|
132
|
-
mostly=0.95
|
|
133
|
-
)
|
|
134
|
-
```
|
|
135
|
-
|
|
136
|
-
### Checkpoints 执行
|
|
137
|
-
|
|
138
|
-
```python
|
|
139
|
-
# 创建 Checkpoint
|
|
140
|
-
checkpoint_config = {
|
|
141
|
-
"name": "my_checkpoint",
|
|
142
|
-
"config_version": 1.0,
|
|
143
|
-
"class_name": "SimpleCheckpoint",
|
|
144
|
-
"validations": [
|
|
145
|
-
{
|
|
146
|
-
"batch_request": {
|
|
147
|
-
"datasource_name": "my_datasource",
|
|
148
|
-
"data_asset_name": "users_df",
|
|
149
|
-
},
|
|
150
|
-
"expectation_suite_name": "users_suite",
|
|
151
|
-
}
|
|
152
|
-
],
|
|
153
|
-
}
|
|
154
|
-
|
|
155
|
-
context.add_checkpoint(**checkpoint_config)
|
|
156
|
-
|
|
157
|
-
# 运行 Checkpoint
|
|
158
|
-
result = context.run_checkpoint(
|
|
159
|
-
checkpoint_name="my_checkpoint",
|
|
160
|
-
batch_request=batch_request,
|
|
161
|
-
)
|
|
162
|
-
|
|
163
|
-
# 检查结果
|
|
164
|
-
# Report the outcome of the checkpoint run stored in `result`.
if result["success"]:
    print("All expectations passed!")
else:
    print("Some expectations failed:")
    for validation in result["run_results"].values():
        # Use a distinct name for each expectation outcome so the outer
        # checkpoint `result` is not overwritten while it is being reported.
        for expectation_result in validation["validation_result"]["results"]:
            if not expectation_result["success"]:
                print(f" - {expectation_result['expectation_config']['expectation_type']}")
|
|
172
|
-
```
|
|
173
|
-
|
|
174
|
-
### Data Docs 生成
|
|
175
|
-
|
|
176
|
-
```python
|
|
177
|
-
# 构建 Data Docs
|
|
178
|
-
context.build_data_docs()
|
|
179
|
-
|
|
180
|
-
# 打开 Data Docs
|
|
181
|
-
context.open_data_docs()
|
|
182
|
-
|
|
183
|
-
# 自定义 Data Docs 站点
|
|
184
|
-
data_docs_config = {
|
|
185
|
-
"sites": {
|
|
186
|
-
"local_site": {
|
|
187
|
-
"class_name": "SiteBuilder",
|
|
188
|
-
"store_backend": {
|
|
189
|
-
"class_name": "TupleFilesystemStoreBackend",
|
|
190
|
-
"base_directory": "uncommitted/data_docs/local_site/",
|
|
191
|
-
},
|
|
192
|
-
"site_index_builder": {
|
|
193
|
-
"class_name": "DefaultSiteIndexBuilder",
|
|
194
|
-
},
|
|
195
|
-
}
|
|
196
|
-
}
|
|
197
|
-
}
|
|
198
|
-
```
|
|
199
|
-
|
|
200
|
-
## dbt 数据测试
|
|
201
|
-
|
|
202
|
-
### 项目结构
|
|
203
|
-
|
|
204
|
-
```yaml
|
|
205
|
-
# dbt_project.yml
|
|
206
|
-
name: 'my_project'
|
|
207
|
-
version: '1.0.0'
|
|
208
|
-
config-version: 2
|
|
209
|
-
|
|
210
|
-
profile: 'default'
|
|
211
|
-
|
|
212
|
-
model-paths: ["models"]
|
|
213
|
-
test-paths: ["tests"]
|
|
214
|
-
seed-paths: ["seeds"]
|
|
215
|
-
macro-paths: ["macros"]
|
|
216
|
-
|
|
217
|
-
models:
|
|
218
|
-
my_project:
|
|
219
|
-
+materialized: table
|
|
220
|
-
```
|
|
221
|
-
|
|
222
|
-
### Schema 测试
|
|
223
|
-
|
|
224
|
-
```yaml
|
|
225
|
-
# models/schema.yml
|
|
226
|
-
version: 2
|
|
227
|
-
|
|
228
|
-
models:
|
|
229
|
-
- name: users
|
|
230
|
-
description: "User table"
|
|
231
|
-
columns:
|
|
232
|
-
- name: user_id
|
|
233
|
-
description: "Primary key"
|
|
234
|
-
tests:
|
|
235
|
-
- unique
|
|
236
|
-
- not_null
|
|
237
|
-
|
|
238
|
-
- name: email
|
|
239
|
-
description: "User email"
|
|
240
|
-
tests:
|
|
241
|
-
- unique
|
|
242
|
-
- not_null
|
|
243
|
-
- dbt_utils.email
|
|
244
|
-
|
|
245
|
-
- name: age
|
|
246
|
-
description: "User age"
|
|
247
|
-
tests:
|
|
248
|
-
- not_null
|
|
249
|
-
- dbt_utils.accepted_range:
|
|
250
|
-
min_value: 0
|
|
251
|
-
max_value: 120
|
|
252
|
-
|
|
253
|
-
- name: status
|
|
254
|
-
description: "User status"
|
|
255
|
-
tests:
|
|
256
|
-
- not_null
|
|
257
|
-
- accepted_values:
|
|
258
|
-
values: ['active', 'inactive', 'pending']
|
|
259
|
-
|
|
260
|
-
- name: created_at
|
|
261
|
-
description: "Creation timestamp"
|
|
262
|
-
tests:
|
|
263
|
-
- not_null
|
|
264
|
-
- dbt_utils.not_future_date
|
|
265
|
-
|
|
266
|
-
- name: country_code
|
|
267
|
-
description: "Country code"
|
|
268
|
-
tests:
|
|
269
|
-
- relationships:
|
|
270
|
-
to: ref('countries')
|
|
271
|
-
field: code
|
|
272
|
-
```
|
|
273
|
-
|
|
274
|
-
### 自定义 Data 测试
|
|
275
|
-
|
|
276
|
-
```sql
|
|
277
|
-
-- tests/assert_positive_revenue.sql
|
|
278
|
-
-- 测试收入必须为正数
|
|
279
|
-
|
|
280
|
-
SELECT
|
|
281
|
-
order_id,
|
|
282
|
-
revenue
|
|
283
|
-
FROM {{ ref('orders') }}
|
|
284
|
-
WHERE revenue <= 0
|
|
285
|
-
```
|
|
286
|
-
|
|
287
|
-
```sql
|
|
288
|
-
-- tests/assert_user_email_domain.sql
|
|
289
|
-
-- 测试用户邮箱域名
|
|
290
|
-
|
|
291
|
-
SELECT
|
|
292
|
-
user_id,
|
|
293
|
-
email
|
|
294
|
-
FROM {{ ref('users') }}
|
|
295
|
-
WHERE email NOT LIKE '%@company.com'
|
|
296
|
-
AND email NOT LIKE '%@partner.com'
|
|
297
|
-
```
|
|
298
|
-
|
|
299
|
-
### Generic 测试
|
|
300
|
-
|
|
301
|
-
```sql
|
|
302
|
-
-- macros/test_valid_date_range.sql
|
|
303
|
-
{% test valid_date_range(model, column_name, start_date, end_date) %}
|
|
304
|
-
|
|
305
|
-
SELECT *
|
|
306
|
-
FROM {{ model }}
|
|
307
|
-
WHERE {{ column_name }} < '{{ start_date }}'
|
|
308
|
-
OR {{ column_name }} > '{{ end_date }}'
|
|
309
|
-
|
|
310
|
-
{% endtest %}
|
|
311
|
-
```
|
|
312
|
-
|
|
313
|
-
```yaml
|
|
314
|
-
# 使用 Generic 测试
|
|
315
|
-
models:
|
|
316
|
-
- name: events
|
|
317
|
-
columns:
|
|
318
|
-
- name: event_date
|
|
319
|
-
tests:
|
|
320
|
-
- valid_date_range:
|
|
321
|
-
start_date: '2020-01-01'
|
|
322
|
-
end_date: '2025-12-31'
|
|
323
|
-
```
|
|
324
|
-
|
|
325
|
-
### Singular 测试
|
|
326
|
-
|
|
327
|
-
```sql
|
|
328
|
-
-- tests/assert_revenue_consistency.sql
|
|
329
|
-
-- 测试收入一致性
|
|
330
|
-
|
|
331
|
-
WITH order_revenue AS (
|
|
332
|
-
SELECT SUM(amount) AS total
|
|
333
|
-
FROM {{ ref('orders') }}
|
|
334
|
-
),
|
|
335
|
-
payment_revenue AS (
|
|
336
|
-
SELECT SUM(amount) AS total
|
|
337
|
-
FROM {{ ref('payments') }}
|
|
338
|
-
)
|
|
339
|
-
|
|
340
|
-
SELECT
|
|
341
|
-
o.total AS order_total,
|
|
342
|
-
p.total AS payment_total,
|
|
343
|
-
ABS(o.total - p.total) AS difference
|
|
344
|
-
FROM order_revenue o
|
|
345
|
-
CROSS JOIN payment_revenue p
|
|
346
|
-
WHERE ABS(o.total - p.total) > 0.01
|
|
347
|
-
```
|
|
348
|
-
|
|
349
|
-
### dbt 测试执行
|
|
350
|
-
|
|
351
|
-
```bash
|
|
352
|
-
# 运行所有测试
|
|
353
|
-
dbt test
|
|
354
|
-
|
|
355
|
-
# 运行特定模型的测试
|
|
356
|
-
dbt test --select users
|
|
357
|
-
|
|
358
|
-
# 运行特定测试
|
|
359
|
-
dbt test --select test_name:unique_users_user_id
|
|
360
|
-
|
|
361
|
-
# 运行失败的测试
|
|
362
|
-
dbt test --select result:fail
|
|
363
|
-
|
|
364
|
-
# 存储测试失败记录
|
|
365
|
-
dbt test --store-failures
|
|
366
|
-
```
|
|
367
|
-
|
|
368
|
-
### dbt Expectations 包
|
|
369
|
-
|
|
370
|
-
```yaml
|
|
371
|
-
# packages.yml
|
|
372
|
-
packages:
|
|
373
|
-
- package: calogica/dbt_expectations
|
|
374
|
-
version: 0.9.0
|
|
375
|
-
```
|
|
376
|
-
|
|
377
|
-
```yaml
|
|
378
|
-
# 使用 dbt_expectations
|
|
379
|
-
models:
|
|
380
|
-
- name: users
|
|
381
|
-
columns:
|
|
382
|
-
- name: email
|
|
383
|
-
tests:
|
|
384
|
-
- dbt_expectations.expect_column_values_to_match_regex:
|
|
385
|
-
regex: "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$"
|
|
386
|
-
|
|
387
|
-
- name: age
|
|
388
|
-
tests:
|
|
389
|
-
- dbt_expectations.expect_column_mean_to_be_between:
|
|
390
|
-
min_value: 18
|
|
391
|
-
max_value: 65
|
|
392
|
-
|
|
393
|
-
- name: created_at
|
|
394
|
-
tests:
|
|
395
|
-
- dbt_expectations.expect_row_values_to_have_recent_data:
|
|
396
|
-
datepart: day
|
|
397
|
-
interval: 7
|
|
398
|
-
```
|
|
399
|
-
|
|
400
|
-
## 数据验证规则
|
|
401
|
-
|
|
402
|
-
### 完整性检查
|
|
403
|
-
|
|
404
|
-
```python
|
|
405
|
-
import pandas as pd
|
|
406
|
-
|
|
407
|
-
def check_completeness(df: pd.DataFrame, required_columns: list) -> dict:
    """Check data completeness for a set of required columns.

    Args:
        df: Frame to inspect.
        required_columns: Column names that are expected to be present.

    Returns:
        A dict with:
        - ``missing_columns``: required columns absent from ``df``, in the
          order they were requested;
        - ``null_counts``: null count per required column that is present;
        - ``<col>_empty_count``: number of empty strings, for every present
          required column holding text/object data.
    """
    results = {}

    # Split the request into columns we can inspect and columns that are
    # absent; indexing ``df`` with an absent column would raise KeyError and
    # hide the very problem this function is meant to report.
    present_columns = [col for col in required_columns if col in df.columns]
    results['missing_columns'] = [
        col for col in required_columns if col not in df.columns
    ]

    # Null counts (only for columns that exist).
    results['null_counts'] = df[present_columns].isnull().sum().to_dict()

    # Empty-string counts for text-like columns (object or string dtype).
    for col in present_columns:
        column = df[col]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            results[f'{col}_empty_count'] = int((column == '').sum())

    return results
|
|
426
|
-
|
|
427
|
-
# 使用示例
|
|
428
|
-
required_cols = ['user_id', 'email', 'name']
|
|
429
|
-
completeness = check_completeness(df, required_cols)
|
|
430
|
-
```
|
|
431
|
-
|
|
432
|
-
### 准确性检查
|
|
433
|
-
|
|
434
|
-
```python
|
|
435
|
-
def check_accuracy(df: pd.DataFrame) -> dict:
|
|
436
|
-
"""检查数据准确性"""
|
|
437
|
-
results = {}
|
|
438
|
-
|
|
439
|
-
# 数值范围检查
|
|
440
|
-
if 'age' in df.columns:
|
|
441
|
-
invalid_age = df[(df['age'] < 0) | (df['age'] > 120)]
|
|
442
|
-
results['invalid_age_count'] = len(invalid_age)
|
|
443
|
-
|
|
444
|
-
# 格式检查
|
|
445
|
-
if 'email' in df.columns:
|
|
446
|
-
email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
|
|
447
|
-
invalid_email = df[~df['email'].str.match(email_pattern, na=False)]
|
|
448
|
-
results['invalid_email_count'] = len(invalid_email)
|
|
449
|
-
|
|
450
|
-
# 逻辑检查
|
|
451
|
-
if 'start_date' in df.columns and 'end_date' in df.columns:
|
|
452
|
-
invalid_dates = df[df['start_date'] > df['end_date']]
|
|
453
|
-
results['invalid_date_range_count'] = len(invalid_dates)
|
|
454
|
-
|
|
455
|
-
return results
|
|
456
|
-
```
|
|
457
|
-
|
|
458
|
-
### 一致性检查
|
|
459
|
-
|
|
460
|
-
```python
|
|
461
|
-
def check_consistency(df1: pd.DataFrame, df2: pd.DataFrame, key: str) -> dict:
    """Check consistency between two frames that share a key column.

    Args:
        df1: First frame.
        df2: Second frame.
        key: Name of the key column present in both frames.

    Returns:
        A dict with:
        - ``only_in_df1`` / ``only_in_df2`` / ``in_both``: counts of distinct
          key values by membership;
        - ``<col>_inconsistent_count``: for every non-key column present in
          both frames, the number of joined rows whose values differ.
    """
    results = {}

    # Key-level consistency.
    keys1 = set(df1[key])
    keys2 = set(df2[key])

    results['only_in_df1'] = len(keys1 - keys2)
    results['only_in_df2'] = len(keys2 - keys1)
    results['in_both'] = len(keys1 & keys2)

    # Value-level consistency on the inner join of the two frames.
    merged = df1.merge(df2, on=key, suffixes=('_1', '_2'))
    for col in df1.columns:
        left, right = f'{col}_1', f'{col}_2'
        if col == key or left not in merged.columns or right not in merged.columns:
            continue
        # A value missing on both sides is consistent; a plain ``!=`` would
        # flag it because NaN never compares equal to NaN.
        both_null = merged[left].isna() & merged[right].isna()
        # ``fillna(True)`` keeps "one side missing" counted as a mismatch
        # for nullable dtypes whose comparison yields NA.
        differs = (merged[left] != merged[right]).fillna(True) & ~both_null
        results[f'{col}_inconsistent_count'] = int(differs.sum())

    return results
|
|
481
|
-
```
|
|
482
|
-
|
|
483
|
-
### 及时性检查
|
|
484
|
-
|
|
485
|
-
```python
|
|
486
|
-
from datetime import datetime, timedelta
|
|
487
|
-
|
|
488
|
-
def check_timeliness(df: pd.DataFrame, timestamp_col: str, max_age_hours: int = 24) -> dict:
    """Check how fresh the data is, based on a timestamp column.

    Args:
        df: Frame to inspect. It is not modified.
        timestamp_col: Name of the column holding timestamps (anything
            ``pd.to_datetime`` can parse).
        max_age_hours: Rows older than this many hours count as stale.

    Returns:
        A dict with ``stale_count``, ``stale_percentage``,
        ``latest_timestamp``, ``oldest_timestamp`` and ``data_age_hours``.
        For an empty frame ``stale_percentage`` is ``0.0`` and
        ``data_age_hours`` is ``None``.
    """
    results = {}

    # Parse into a local Series instead of assigning back into ``df`` so the
    # caller's frame is left untouched.
    timestamps = pd.to_datetime(df[timestamp_col])
    # NOTE(review): naive local time is used, so the column is assumed to
    # hold naive timestamps in the same local zone — confirm with callers.
    now = datetime.now()
    threshold = now - timedelta(hours=max_age_hours)

    # Stale rows.
    total_rows = len(timestamps)
    stale_count = int((timestamps < threshold).sum())
    results['stale_count'] = stale_count
    # Guard the empty frame, which previously raised ZeroDivisionError.
    results['stale_percentage'] = stale_count / total_rows * 100 if total_rows else 0.0

    # Newest / oldest timestamps and the age of the newest row.
    latest = timestamps.max()
    results['latest_timestamp'] = latest
    results['oldest_timestamp'] = timestamps.min()
    results['data_age_hours'] = (
        None if pd.isna(latest) else (now - latest).total_seconds() / 3600
    )

    return results
|
|
507
|
-
```
|
|
508
|
-
|
|
509
|
-
## 数据血缘追踪
|
|
510
|
-
|
|
511
|
-
### dbt 血缘
|
|
512
|
-
|
|
513
|
-
```sql
|
|
514
|
-
-- models/staging/stg_users.sql
|
|
515
|
-
SELECT
|
|
516
|
-
user_id,
|
|
517
|
-
email,
|
|
518
|
-
created_at
|
|
519
|
-
FROM {{ source('raw', 'users') }}
|
|
520
|
-
|
|
521
|
-
-- models/marts/dim_users.sql
|
|
522
|
-
SELECT
|
|
523
|
-
user_id,
|
|
524
|
-
email,
|
|
525
|
-
DATE(created_at) AS created_date
|
|
526
|
-
FROM {{ ref('stg_users') }}
|
|
527
|
-
|
|
528
|
-
-- models/marts/fct_orders.sql
|
|
529
|
-
SELECT
|
|
530
|
-
o.order_id,
|
|
531
|
-
u.user_id,
|
|
532
|
-
o.amount
|
|
533
|
-
FROM {{ ref('stg_orders') }} o
|
|
534
|
-
LEFT JOIN {{ ref('dim_users') }} u
|
|
535
|
-
ON o.user_id = u.user_id
|
|
536
|
-
```
|
|
537
|
-
|
|
538
|
-
```bash
|
|
539
|
-
# 生成血缘图
|
|
540
|
-
dbt docs generate
|
|
541
|
-
dbt docs serve
|
|
542
|
-
|
|
543
|
-
# 查看血缘关系
|
|
544
|
-
# http://localhost:8080
|
|
545
|
-
```
|
|
546
|
-
|
|
547
|
-
### 自定义血缘追踪
|
|
548
|
-
|
|
549
|
-
```python
|
|
550
|
-
from dataclasses import dataclass
|
|
551
|
-
from typing import List, Dict
|
|
552
|
-
|
|
553
|
-
@dataclass
class DataLineage:
    """Lineage record for a single table."""

    # Name of the table this record describes.
    table_name: str
    # Names of the tables this table is derived from.
    upstream_tables: List[str]
    # Description (or SQL) of the transformation producing the table.
    transformation: str
    # ISO-8601 timestamp string recorded when the lineage was registered.
    created_at: str


class LineageTracker:
    """In-memory registry of table lineage, keyed by table name."""

    def __init__(self):
        self.lineage: Dict[str, DataLineage] = {}

    def register(self, table_name: str, upstream: List[str], transformation: str):
        """Register (or overwrite) the lineage record for ``table_name``."""
        # Imported locally so the class works even when the surrounding
        # module has not imported ``datetime``.
        from datetime import datetime

        self.lineage[table_name] = DataLineage(
            table_name=table_name,
            upstream_tables=upstream,
            transformation=transformation,
            created_at=datetime.now().isoformat(),
        )

    def get_upstream(self, table_name: str, recursive: bool = False) -> List[str]:
        """Return the upstream tables of ``table_name``.

        Args:
            table_name: Table to look up.
            recursive: When true, include transitive upstream tables.

        Returns:
            An empty list for an unregistered table. Otherwise the direct
            upstream list, or, when ``recursive`` is true, every transitive
            upstream table exactly once in depth-first, first-seen order.
            ``table_name`` itself is never included, even when the lineage
            contains a cycle.
        """
        if table_name not in self.lineage:
            return []

        direct = self.lineage[table_name].upstream_tables
        if not recursive:
            return direct

        # Iterative depth-first walk; the dict doubles as an ordered
        # "visited" set so cyclic lineage terminates and the result order is
        # deterministic.
        seen: Dict[str, None] = {}
        stack = list(reversed(direct))
        while stack:
            current = stack.pop()
            if current == table_name or current in seen:
                continue
            seen[current] = None
            record = self.lineage.get(current)
            if record is not None:
                stack.extend(reversed(record.upstream_tables))
        return list(seen)

    def get_downstream(self, table_name: str) -> List[str]:
        """Return the tables that list ``table_name`` as a direct upstream."""
        return [
            name
            for name, record in self.lineage.items()
            if table_name in record.upstream_tables
        ]
|
|
598
|
-
|
|
599
|
-
# 使用示例
|
|
600
|
-
tracker = LineageTracker()
|
|
601
|
-
|
|
602
|
-
tracker.register('stg_users', ['raw.users'], 'SELECT * FROM raw.users')
|
|
603
|
-
tracker.register('dim_users', ['stg_users'], 'SELECT user_id, email FROM stg_users')
|
|
604
|
-
tracker.register('fct_orders', ['stg_orders', 'dim_users'], 'JOIN transformation')
|
|
605
|
-
|
|
606
|
-
print(tracker.get_upstream('fct_orders', recursive=True))
|
|
607
|
-
# ['stg_orders', 'dim_users', 'stg_users', 'raw.users']
|
|
608
|
-
```
|
|
609
|
-
|
|
610
|
-
## 数据质量监控
|
|
611
|
-
|
|
612
|
-
### 质量指标计算
|
|
613
|
-
|
|
614
|
-
```python
|
|
615
|
-
from typing import Dict, List

import pandas as pd
|
|
617
|
-
|
|
618
|
-
class DataQualityMetrics:
    """Data quality scores, each expressed as a percentage (0-100).

    The per-dimension scores return ``0.0`` for an empty frame so that a
    missing dataset never reports as healthy.
    """

    @staticmethod
    def calculate_completeness(df: pd.DataFrame) -> float:
        """Completeness score: percentage of non-null cells in ``df``."""
        total_cells = df.size
        if total_cells == 0:
            return 0.0
        non_null_cells = df.count().sum()
        return (non_null_cells / total_cells) * 100

    @staticmethod
    def calculate_uniqueness(df: pd.DataFrame, key_columns: List[str]) -> float:
        """Uniqueness score: percentage of rows with a distinct key.

        Args:
            df: Frame to inspect.
            key_columns: Columns that together form the key.
        """
        total_rows = len(df)
        if total_rows == 0:
            return 0.0
        unique_rows = df[key_columns].drop_duplicates().shape[0]
        return (unique_rows / total_rows) * 100

    @staticmethod
    def calculate_validity(df: pd.DataFrame, rules: Dict) -> float:
        """Validity score: percentage of rows that satisfy every rule.

        Args:
            df: Frame to inspect.
            rules: Mapping of column name to a rule dict. Supported rules:
                ``{'type': 'range', 'min': ..., 'max': ...}`` and
                ``{'type': 'regex', 'pattern': ...}``. Other rule types are
                ignored.
        """
        total_rows = len(df)
        if total_rows == 0:
            return 0.0

        # Accumulate a single per-row "invalid" flag so that a row breaking
        # several rules is counted once; subtracting per rule could push the
        # score below zero.
        invalid = pd.Series(False, index=range(total_rows), dtype=bool).to_numpy()

        for column, rule in rules.items():
            if rule['type'] == 'range':
                values = df[column]
                mask = (values < rule['min']) | (values > rule['max'])
            elif rule['type'] == 'regex':
                mask = ~df[column].str.match(rule['pattern'], na=False)
            else:
                continue
            # Combine positionally (not by index label) and treat an
            # undetermined comparison as "not invalid".
            invalid = invalid | mask.to_numpy(dtype=bool, na_value=False)

        valid_rows = total_rows - int(invalid.sum())
        return (valid_rows / total_rows) * 100

    @staticmethod
    def calculate_overall_score(metrics: Dict[str, float]) -> float:
        """Weighted overall score; metrics missing from ``metrics`` count as 0."""
        weights = {
            'completeness': 0.3,
            'uniqueness': 0.2,
            'validity': 0.3,
            'timeliness': 0.2,
        }

        return sum(metrics.get(name, 0) * weight for name, weight in weights.items())
|
|
665
|
-
|
|
666
|
-
# 使用示例
|
|
667
|
-
metrics = DataQualityMetrics()
|
|
668
|
-
|
|
669
|
-
completeness = metrics.calculate_completeness(df)
|
|
670
|
-
uniqueness = metrics.calculate_uniqueness(df, ['user_id'])
|
|
671
|
-
validity = metrics.calculate_validity(df, {
|
|
672
|
-
'age': {'type': 'range', 'min': 0, 'max': 120}
|
|
673
|
-
})
|
|
674
|
-
|
|
675
|
-
overall = metrics.calculate_overall_score({
|
|
676
|
-
'completeness': completeness,
|
|
677
|
-
'uniqueness': uniqueness,
|
|
678
|
-
'validity': validity,
|
|
679
|
-
'timeliness': 95.0,
|
|
680
|
-
})
|
|
681
|
-
|
|
682
|
-
print(f"Overall Quality Score: {overall:.2f}%")
|
|
683
|
-
```
|
|
684
|
-
|
|
685
|
-
### 质量告警
|
|
686
|
-
|
|
687
|
-
```python
|
|
688
|
-
class QualityAlert:
    """Threshold-based quality alerting."""

    def __init__(self, thresholds: Dict[str, float]):
        # Minimum acceptable value per metric name.
        self.thresholds = thresholds

    def check_and_alert(self, metrics: Dict[str, float]) -> List[str]:
        """Return an alert message for every metric below its threshold.

        Args:
            metrics: Mapping of metric name to its current value.

        Returns:
            Alert messages in the iteration order of ``metrics``. Metrics
            without a configured threshold never alert.
        """
        alerts = []

        for metric, value in metrics.items():
            threshold = self.thresholds.get(metric)
            # Compare against None explicitly: a configured threshold of 0
            # is falsy and would otherwise be treated as "no threshold".
            if threshold is not None and value < threshold:
                alerts.append(
                    f"ALERT: {metric} is {value:.2f}%, "
                    f"below threshold {threshold}%"
                )

        return alerts
|
|
707
|
-
|
|
708
|
-
# 使用示例
|
|
709
|
-
alert_system = QualityAlert({
|
|
710
|
-
'completeness': 95.0,
|
|
711
|
-
'uniqueness': 99.0,
|
|
712
|
-
'validity': 98.0,
|
|
713
|
-
})
|
|
714
|
-
|
|
715
|
-
alerts = alert_system.check_and_alert({
|
|
716
|
-
'completeness': 92.5,
|
|
717
|
-
'uniqueness': 99.5,
|
|
718
|
-
'validity': 97.0,
|
|
719
|
-
})
|
|
720
|
-
|
|
721
|
-
for alert in alerts:
|
|
722
|
-
print(alert)
|
|
723
|
-
# 发送通知(Slack/Email/PagerDuty)
|
|
724
|
-
```
|
|
725
|
-
|
|
726
|
-
## Soda Core 集成
|
|
727
|
-
|
|
728
|
-
### 安装和配置
|
|
729
|
-
|
|
730
|
-
```bash
|
|
731
|
-
# 安装
|
|
732
|
-
pip install soda-core-postgres
|
|
733
|
-
|
|
734
|
-
# 配置
|
|
735
|
-
# configuration.yml
|
|
736
|
-
data_source my_datasource:
|
|
737
|
-
type: postgres
|
|
738
|
-
host: localhost
|
|
739
|
-
port: 5432
|
|
740
|
-
username: user
|
|
741
|
-
password: pass
|
|
742
|
-
database: mydb
|
|
743
|
-
```
|
|
744
|
-
|
|
745
|
-
### Checks 定义
|
|
746
|
-
|
|
747
|
-
```yaml
|
|
748
|
-
# checks.yml
|
|
749
|
-
checks for users:
|
|
750
|
-
- row_count > 100
|
|
751
|
-
- missing_count(user_id) = 0
|
|
752
|
-
- missing_count(email) = 0
|
|
753
|
-
- duplicate_count(user_id) = 0
|
|
754
|
-
- duplicate_count(email) = 0
|
|
755
|
-
- invalid_count(email) = 0:
|
|
756
|
-
valid format: email
|
|
757
|
-
- invalid_count(age) = 0:
|
|
758
|
-
valid min: 0
|
|
759
|
-
valid max: 120
|
|
760
|
-
- values in (status) must be in ['active', 'inactive', 'pending']
|
|
761
|
-
- freshness(created_at) < 1d
|
|
762
|
-
```
|
|
763
|
-
|
|
764
|
-
### 执行检查
|
|
765
|
-
|
|
766
|
-
```python
|
|
767
|
-
from soda.scan import Scan
|
|
768
|
-
|
|
769
|
-
# 创建扫描
|
|
770
|
-
scan = Scan()
|
|
771
|
-
scan.set_data_source_name("my_datasource")
|
|
772
|
-
scan.add_configuration_yaml_file("configuration.yml")
|
|
773
|
-
scan.add_sodacl_yaml_file("checks.yml")
|
|
774
|
-
|
|
775
|
-
# 执行扫描
|
|
776
|
-
scan.execute()
|
|
777
|
-
|
|
778
|
-
# 检查结果
|
|
779
|
-
if scan.has_check_fails():
|
|
780
|
-
print("Quality checks failed!")
|
|
781
|
-
for check in scan.get_checks_fail():
|
|
782
|
-
print(f" - {check}")
|
|
783
|
-
else:
|
|
784
|
-
print("All quality checks passed!")
|
|
785
|
-
```
|
|
786
|
-
|
|
787
|
-
## 最佳实践
|
|
788
|
-
|
|
789
|
-
### 分层验证策略
|
|
790
|
-
|
|
791
|
-
```python
|
|
792
|
-
# 1. 源数据验证
|
|
793
|
-
def validate_source(df: pd.DataFrame):
|
|
794
|
-
"""源数据验证"""
|
|
795
|
-
assert not df.empty, "Source data is empty"
|
|
796
|
-
assert df['id'].is_unique, "Duplicate IDs in source"
|
|
797
|
-
|
|
798
|
-
# 2. 转换验证
|
|
799
|
-
def validate_transformation(input_df: pd.DataFrame, output_df: pd.DataFrame):
|
|
800
|
-
"""转换验证"""
|
|
801
|
-
assert len(output_df) <= len(input_df), "Row count increased"
|
|
802
|
-
assert set(output_df['id']).issubset(set(input_df['id'])), "New IDs appeared"
|
|
803
|
-
|
|
804
|
-
# 3. 目标验证
|
|
805
|
-
def validate_target(df: pd.DataFrame):
|
|
806
|
-
"""目标验证"""
|
|
807
|
-
assert df['amount'].sum() > 0, "Total amount is zero"
|
|
808
|
-
assert df['date'].max() >= pd.Timestamp.now() - pd.Timedelta(days=1), "Data is stale"
|
|
809
|
-
```
|
|
810
|
-
|
|
811
|
-
### 持续质量监控
|
|
812
|
-
|
|
813
|
-
```python
|
|
814
|
-
import schedule
|
|
815
|
-
import time
|
|
816
|
-
|
|
817
|
-
def run_quality_checks():
|
|
818
|
-
"""运行质量检查"""
|
|
819
|
-
df = load_data()
|
|
820
|
-
|
|
821
|
-
metrics = {
|
|
822
|
-
'completeness': calculate_completeness(df),
|
|
823
|
-
'validity': calculate_validity(df),
|
|
824
|
-
'timeliness': calculate_timeliness(df),
|
|
825
|
-
}
|
|
826
|
-
|
|
827
|
-
# 记录指标
|
|
828
|
-
log_metrics(metrics)
|
|
829
|
-
|
|
830
|
-
# 检查告警
|
|
831
|
-
alerts = check_alerts(metrics)
|
|
832
|
-
if alerts:
|
|
833
|
-
send_notifications(alerts)
|
|
834
|
-
|
|
835
|
-
# 定时执行
|
|
836
|
-
schedule.every(1).hours.do(run_quality_checks)
|
|
837
|
-
|
|
838
|
-
while True:
|
|
839
|
-
schedule.run_pending()
|
|
840
|
-
time.sleep(60)
|
|
841
|
-
```
|
|
842
|
-
|
|
843
|
-
### 质量报告生成
|
|
844
|
-
|
|
845
|
-
```python
|
|
846
|
-
def generate_quality_report(df: pd.DataFrame) -> str:
    """Build a Markdown data quality report for ``df``.

    The report contains a dataset overview (row and column counts), the
    null count and percentage of every column that has nulls, and the number
    of fully duplicated rows.

    Returns:
        The report as a single newline-joined string.
    """
    # Imported locally so the function works even when the surrounding
    # module has not imported ``datetime``.
    from datetime import datetime

    report = []

    report.append("# Data Quality Report")
    report.append(f"Generated at: {datetime.now()}")
    report.append("\n## Dataset Overview")
    report.append(f"- Total Rows: {len(df)}")
    report.append(f"- Total Columns: {len(df.columns)}")

    report.append("\n## Completeness")
    null_counts = df.isnull().sum()
    for col, count in null_counts.items():
        # ``count > 0`` implies the frame has rows, so the division is safe.
        if count > 0:
            pct = (count / len(df)) * 100
            report.append(f"- {col}: {count} nulls ({pct:.2f}%)")

    report.append("\n## Duplicates")
    duplicates = df.duplicated().sum()
    report.append(f"- Total Duplicates: {duplicates}")

    return "\n".join(report)
|
|
868
|
-
```
|
|
869
|
-
|
|
870
|
-
## 工具对比
|
|
871
|
-
|
|
872
|
-
| 工具 | 优势 | 适用场景 |
|
|
873
|
-
|------|------|----------|
|
|
874
|
-
| Great Expectations | 丰富的 Expectations、Data Docs | Python 生态、复杂验证 |
|
|
875
|
-
| dbt | SQL 原生、血缘追踪 | 数据仓库、转换测试 |
|
|
876
|
-
| Soda Core | 简洁的 YAML 配置 | 快速验证、CI/CD |
|
|
877
|
-
| Apache Griffin | 大数据质量 | Hadoop/Spark 生态 |
|
|
878
|
-
| Deequ | Spark 原生 | 大规模数据验证 |
|
|
879
|
-
|
|
880
|
-
## 工具清单
|
|
881
|
-
|
|
882
|
-
| 工具 | 用途 | 推荐场景 |
|
|
883
|
-
|------|------|----------|
|
|
884
|
-
| Great Expectations | 数据验证框架 | Python 数据管道 |
|
|
885
|
-
| dbt | 数据转换测试 | SQL 数据仓库 |
|
|
886
|
-
| Soda Core | 数据质量检查 | 轻量级验证 |
|
|
887
|
-
| Apache Griffin | 大数据质量 | Hadoop 生态 |
|
|
888
|
-
| Deequ | Spark 数据质量 | 大规模数据 |
|
|
889
|
-
| Monte Carlo | 数据可观测性 | 企业级监控 |
|
|
890
|
-
| Datafold | 数据 Diff | 变更验证 |
|
|
891
|
-
|
|
892
|
-
## 触发词
|
|
893
|
-
|
|
894
|
-
数据质量、Great Expectations、dbt、数据验证、数据测试、完整性、准确性、一致性、数据血缘、质量监控
|