code-abyss 1.6.16 → 1.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. package/README.md +8 -6
  2. package/bin/install.js +59 -163
  3. package/bin/lib/ccline.js +82 -0
  4. package/bin/lib/utils.js +61 -0
  5. package/package.json +5 -2
  6. package/skills/SKILL.md +24 -16
  7. package/skills/domains/ai/SKILL.md +2 -2
  8. package/skills/domains/ai/prompt-and-eval.md +279 -0
  9. package/skills/domains/architecture/SKILL.md +2 -3
  10. package/skills/domains/architecture/security-arch.md +87 -0
  11. package/skills/domains/data-engineering/SKILL.md +188 -26
  12. package/skills/domains/development/SKILL.md +1 -4
  13. package/skills/domains/devops/SKILL.md +3 -5
  14. package/skills/domains/devops/performance.md +63 -0
  15. package/skills/domains/devops/testing.md +97 -0
  16. package/skills/domains/frontend-design/SKILL.md +12 -3
  17. package/skills/domains/frontend-design/claymorphism/SKILL.md +117 -0
  18. package/skills/domains/frontend-design/claymorphism/references/tokens.css +52 -0
  19. package/skills/domains/frontend-design/engineering.md +287 -0
  20. package/skills/domains/frontend-design/glassmorphism/SKILL.md +138 -0
  21. package/skills/domains/frontend-design/glassmorphism/references/tokens.css +32 -0
  22. package/skills/domains/frontend-design/liquid-glass/SKILL.md +135 -0
  23. package/skills/domains/frontend-design/liquid-glass/references/tokens.css +81 -0
  24. package/skills/domains/frontend-design/neubrutalism/SKILL.md +141 -0
  25. package/skills/domains/frontend-design/neubrutalism/references/tokens.css +44 -0
  26. package/skills/domains/infrastructure/SKILL.md +174 -34
  27. package/skills/domains/mobile/SKILL.md +211 -21
  28. package/skills/domains/orchestration/SKILL.md +1 -0
  29. package/skills/domains/security/SKILL.md +4 -6
  30. package/skills/domains/security/blue-team.md +57 -0
  31. package/skills/domains/security/red-team.md +54 -0
  32. package/skills/domains/security/threat-intel.md +50 -0
  33. package/skills/orchestration/multi-agent/SKILL.md +195 -46
  34. package/skills/run_skill.js +139 -0
  35. package/skills/tools/gen-docs/SKILL.md +6 -4
  36. package/skills/tools/gen-docs/scripts/doc_generator.js +363 -0
  37. package/skills/tools/lib/shared.js +98 -0
  38. package/skills/tools/verify-change/SKILL.md +8 -6
  39. package/skills/tools/verify-change/scripts/change_analyzer.js +289 -0
  40. package/skills/tools/verify-module/SKILL.md +6 -4
  41. package/skills/tools/verify-module/scripts/module_scanner.js +171 -0
  42. package/skills/tools/verify-quality/SKILL.md +5 -3
  43. package/skills/tools/verify-quality/scripts/quality_checker.js +337 -0
  44. package/skills/tools/verify-security/SKILL.md +7 -5
  45. package/skills/tools/verify-security/scripts/security_scanner.js +283 -0
  46. package/skills/__pycache__/run_skill.cpython-312.pyc +0 -0
  47. package/skills/domains/COVERAGE_PLAN.md +0 -232
  48. package/skills/domains/ai/model-evaluation.md +0 -790
  49. package/skills/domains/ai/prompt-engineering.md +0 -703
  50. package/skills/domains/architecture/compliance.md +0 -299
  51. package/skills/domains/architecture/data-security.md +0 -184
  52. package/skills/domains/data-engineering/data-pipeline.md +0 -762
  53. package/skills/domains/data-engineering/data-quality.md +0 -894
  54. package/skills/domains/data-engineering/stream-processing.md +0 -791
  55. package/skills/domains/development/dart.md +0 -963
  56. package/skills/domains/development/kotlin.md +0 -834
  57. package/skills/domains/development/php.md +0 -659
  58. package/skills/domains/development/swift.md +0 -755
  59. package/skills/domains/devops/e2e-testing.md +0 -914
  60. package/skills/domains/devops/performance-testing.md +0 -734
  61. package/skills/domains/devops/testing-strategy.md +0 -667
  62. package/skills/domains/frontend-design/build-tools.md +0 -743
  63. package/skills/domains/frontend-design/performance.md +0 -734
  64. package/skills/domains/frontend-design/testing.md +0 -699
  65. package/skills/domains/infrastructure/gitops.md +0 -735
  66. package/skills/domains/infrastructure/iac.md +0 -855
  67. package/skills/domains/infrastructure/kubernetes.md +0 -1018
  68. package/skills/domains/mobile/android-dev.md +0 -979
  69. package/skills/domains/mobile/cross-platform.md +0 -795
  70. package/skills/domains/mobile/ios-dev.md +0 -931
  71. package/skills/domains/security/secrets-management.md +0 -834
  72. package/skills/domains/security/supply-chain.md +0 -931
  73. package/skills/domains/security/threat-modeling.md +0 -828
  74. package/skills/run_skill.py +0 -153
  75. package/skills/tests/README.md +0 -225
  76. package/skills/tests/SUMMARY.md +0 -362
  77. package/skills/tests/__init__.py +0 -3
  78. package/skills/tests/__pycache__/test_change_analyzer.cpython-312.pyc +0 -0
  79. package/skills/tests/__pycache__/test_doc_generator.cpython-312.pyc +0 -0
  80. package/skills/tests/__pycache__/test_module_scanner.cpython-312.pyc +0 -0
  81. package/skills/tests/__pycache__/test_quality_checker.cpython-312.pyc +0 -0
  82. package/skills/tests/__pycache__/test_security_scanner.cpython-312.pyc +0 -0
  83. package/skills/tests/test_change_analyzer.py +0 -558
  84. package/skills/tests/test_doc_generator.py +0 -538
  85. package/skills/tests/test_module_scanner.py +0 -376
  86. package/skills/tests/test_quality_checker.py +0 -516
  87. package/skills/tests/test_security_scanner.py +0 -426
  88. package/skills/tools/gen-docs/scripts/__pycache__/doc_generator.cpython-312.pyc +0 -0
  89. package/skills/tools/gen-docs/scripts/doc_generator.py +0 -520
  90. package/skills/tools/verify-change/scripts/__pycache__/change_analyzer.cpython-312.pyc +0 -0
  91. package/skills/tools/verify-change/scripts/change_analyzer.py +0 -529
  92. package/skills/tools/verify-module/scripts/__pycache__/module_scanner.cpython-312.pyc +0 -0
  93. package/skills/tools/verify-module/scripts/module_scanner.py +0 -321
  94. package/skills/tools/verify-quality/scripts/__pycache__/quality_checker.cpython-312.pyc +0 -0
  95. package/skills/tools/verify-quality/scripts/quality_checker.py +0 -481
  96. package/skills/tools/verify-security/scripts/__pycache__/security_scanner.cpython-312.pyc +0 -0
  97. package/skills/tools/verify-security/scripts/security_scanner.py +0 -374
@@ -1,894 +0,0 @@
1
- ---
2
- name: data-quality
3
- description: 数据质量保障。Great Expectations、dbt、数据验证、数据测试、数据血缘、完整性检查。当用户提到数据质量、Great Expectations、dbt、数据验证、数据测试时使用。
4
- ---
5
-
6
- # 🎯 数据质量秘典 · Data Quality
7
-
8
- ## 质量维度
9
-
10
- ```
11
- 完整性 → 准确性 → 一致性 → 及时性 → 有效性
12
- │ │ │ │ │
13
- └─ 非空 ─┴─ 范围 ─┴─ 关联 ─┴─ 新鲜度 ─┴─ 格式
14
- ```
15
-
16
- ## Great Expectations 基础
17
-
18
- ### 安装和初始化
19
-
20
- ```bash
21
- # 安装
22
- pip install great_expectations
23
-
24
- # 初始化项目
25
- great_expectations init
26
-
27
- # 项目结构
28
- great_expectations/
29
- ├── great_expectations.yml
30
- ├── expectations/
31
- ├── checkpoints/
32
- ├── plugins/
33
- └── uncommitted/
34
- ```
35
-
36
- ### 创建 Data Context
37
-
38
- ```python
39
- import great_expectations as gx
40
- from great_expectations.data_context import FileDataContext
41
-
42
- # 获取 Data Context
43
- context = gx.get_context()
44
-
45
- # 添加数据源
46
- datasource = context.sources.add_pandas("my_datasource")
47
-
48
- # 添加数据资产
49
- data_asset = datasource.add_dataframe_asset(name="users_df")
50
-
51
- # 构建批次请求
52
- batch_request = data_asset.build_batch_request(dataframe=df)
53
- ```
54
-
55
- ### Expectations 定义
56
-
57
- ```python
58
- import pandas as pd
59
- import great_expectations as gx
60
-
61
- # 创建 Validator
62
- context = gx.get_context()
63
- validator = context.sources.pandas_default.read_dataframe(df)
64
-
65
- # 基础 Expectations
66
- validator.expect_table_row_count_to_be_between(min_value=100, max_value=10000)
67
- validator.expect_table_column_count_to_equal(value=5)
68
-
69
- # 列存在性
70
- validator.expect_column_to_exist(column="user_id")
71
- validator.expect_column_to_exist(column="email")
72
-
73
- # 非空检查
74
- validator.expect_column_values_to_not_be_null(column="user_id")
75
- validator.expect_column_values_to_not_be_null(column="email")
76
-
77
- # 唯一性检查
78
- validator.expect_column_values_to_be_unique(column="user_id")
79
- validator.expect_column_values_to_be_unique(column="email")
80
-
81
- # 值范围检查
82
- validator.expect_column_values_to_be_between(
83
- column="age",
84
- min_value=0,
85
- max_value=120
86
- )
87
-
88
- # 值集合检查
89
- validator.expect_column_values_to_be_in_set(
90
- column="status",
91
- value_set=["active", "inactive", "pending"]
92
- )
93
-
94
- # 正则表达式检查
95
- validator.expect_column_values_to_match_regex(
96
- column="email",
97
- regex=r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
98
- )
99
-
100
- # 类型检查
101
- validator.expect_column_values_to_be_of_type(
102
- column="age",
103
- type_="int64"
104
- )
105
-
106
- # 保存 Expectation Suite
107
- validator.save_expectation_suite(discard_failed_expectations=False)
108
- ```
109
-
110
- ### 自定义 Expectations
111
-
112
- ```python
113
- from great_expectations.expectations.expectation import ColumnMapExpectation
114
- from great_expectations.execution_engine import PandasExecutionEngine
115
-
116
- class ExpectColumnValuesToBeValidPhoneNumber(ColumnMapExpectation):
117
- """期望列值为有效电话号码"""
118
-
119
- map_metric = "column_values.match_phone_pattern"
120
-
121
- @classmethod
122
- def _atomic_prescriptive_template(cls, **kwargs):
123
- return "values must be valid phone numbers"
124
-
125
- @classmethod
126
- def _prescriptive_template(cls, **kwargs):
127
- return "At least $mostly_pct % of values in $column must be valid phone numbers"
128
-
129
- # 注册自定义 Expectation
130
- validator.expect_column_values_to_be_valid_phone_number(
131
- column="phone",
132
- mostly=0.95
133
- )
134
- ```
135
-
136
- ### Checkpoints 执行
137
-
138
- ```python
139
- # 创建 Checkpoint
140
- checkpoint_config = {
141
- "name": "my_checkpoint",
142
- "config_version": 1.0,
143
- "class_name": "SimpleCheckpoint",
144
- "validations": [
145
- {
146
- "batch_request": {
147
- "datasource_name": "my_datasource",
148
- "data_asset_name": "users_df",
149
- },
150
- "expectation_suite_name": "users_suite",
151
- }
152
- ],
153
- }
154
-
155
- context.add_checkpoint(**checkpoint_config)
156
-
157
- # 运行 Checkpoint
158
- result = context.run_checkpoint(
159
- checkpoint_name="my_checkpoint",
160
- batch_request=batch_request,
161
- )
162
-
163
- # 检查结果
164
- if result["success"]:
165
- print("All expectations passed!")
166
- else:
167
- print("Some expectations failed:")
168
- for validation in result["run_results"].values():
169
- for result in validation["validation_result"]["results"]:
170
- if not result["success"]:
171
- print(f" - {result['expectation_config']['expectation_type']}")
172
- ```
173
-
174
- ### Data Docs 生成
175
-
176
- ```python
177
- # 构建 Data Docs
178
- context.build_data_docs()
179
-
180
- # 打开 Data Docs
181
- context.open_data_docs()
182
-
183
- # 自定义 Data Docs 站点
184
- data_docs_config = {
185
- "sites": {
186
- "local_site": {
187
- "class_name": "SiteBuilder",
188
- "store_backend": {
189
- "class_name": "TupleFilesystemStoreBackend",
190
- "base_directory": "uncommitted/data_docs/local_site/",
191
- },
192
- "site_index_builder": {
193
- "class_name": "DefaultSiteIndexBuilder",
194
- },
195
- }
196
- }
197
- }
198
- ```
199
-
200
- ## dbt 数据测试
201
-
202
- ### 项目结构
203
-
204
- ```yaml
205
- # dbt_project.yml
206
- name: 'my_project'
207
- version: '1.0.0'
208
- config-version: 2
209
-
210
- profile: 'default'
211
-
212
- model-paths: ["models"]
213
- test-paths: ["tests"]
214
- seed-paths: ["seeds"]
215
- macro-paths: ["macros"]
216
-
217
- models:
218
- my_project:
219
- +materialized: table
220
- ```
221
-
222
- ### Schema 测试
223
-
224
- ```yaml
225
- # models/schema.yml
226
- version: 2
227
-
228
- models:
229
- - name: users
230
- description: "User table"
231
- columns:
232
- - name: user_id
233
- description: "Primary key"
234
- tests:
235
- - unique
236
- - not_null
237
-
238
- - name: email
239
- description: "User email"
240
- tests:
241
- - unique
242
- - not_null
243
- - dbt_utils.email
244
-
245
- - name: age
246
- description: "User age"
247
- tests:
248
- - not_null
249
- - dbt_utils.accepted_range:
250
- min_value: 0
251
- max_value: 120
252
-
253
- - name: status
254
- description: "User status"
255
- tests:
256
- - not_null
257
- - accepted_values:
258
- values: ['active', 'inactive', 'pending']
259
-
260
- - name: created_at
261
- description: "Creation timestamp"
262
- tests:
263
- - not_null
264
- - dbt_utils.not_future_date
265
-
266
- - name: country_code
267
- description: "Country code"
268
- tests:
269
- - relationships:
270
- to: ref('countries')
271
- field: code
272
- ```
273
-
274
- ### 自定义 Data 测试
275
-
276
- ```sql
277
- -- tests/assert_positive_revenue.sql
278
- -- 测试收入必须为正数
279
-
280
- SELECT
281
- order_id,
282
- revenue
283
- FROM {{ ref('orders') }}
284
- WHERE revenue <= 0
285
- ```
286
-
287
- ```sql
288
- -- tests/assert_user_email_domain.sql
289
- -- 测试用户邮箱域名
290
-
291
- SELECT
292
- user_id,
293
- email
294
- FROM {{ ref('users') }}
295
- WHERE email NOT LIKE '%@company.com'
296
- AND email NOT LIKE '%@partner.com'
297
- ```
298
-
299
- ### Generic 测试
300
-
301
- ```sql
302
- -- macros/test_valid_date_range.sql
303
- {% test valid_date_range(model, column_name, start_date, end_date) %}
304
-
305
- SELECT *
306
- FROM {{ model }}
307
- WHERE {{ column_name }} < '{{ start_date }}'
308
- OR {{ column_name }} > '{{ end_date }}'
309
-
310
- {% endtest %}
311
- ```
312
-
313
- ```yaml
314
- # 使用 Generic 测试
315
- models:
316
- - name: events
317
- columns:
318
- - name: event_date
319
- tests:
320
- - valid_date_range:
321
- start_date: '2020-01-01'
322
- end_date: '2025-12-31'
323
- ```
324
-
325
- ### Singular 测试
326
-
327
- ```sql
328
- -- tests/assert_revenue_consistency.sql
329
- -- 测试收入一致性
330
-
331
- WITH order_revenue AS (
332
- SELECT SUM(amount) AS total
333
- FROM {{ ref('orders') }}
334
- ),
335
- payment_revenue AS (
336
- SELECT SUM(amount) AS total
337
- FROM {{ ref('payments') }}
338
- )
339
-
340
- SELECT
341
- o.total AS order_total,
342
- p.total AS payment_total,
343
- ABS(o.total - p.total) AS difference
344
- FROM order_revenue o
345
- CROSS JOIN payment_revenue p
346
- WHERE ABS(o.total - p.total) > 0.01
347
- ```
348
-
349
- ### dbt 测试执行
350
-
351
- ```bash
352
- # 运行所有测试
353
- dbt test
354
-
355
- # 运行特定模型的测试
356
- dbt test --select users
357
-
358
- # 运行特定测试
359
- dbt test --select test_name:unique_users_user_id
360
-
361
- # 运行失败的测试
362
- dbt test --select result:fail
363
-
364
- # 存储测试失败记录
365
- dbt test --store-failures
366
- ```
367
-
368
- ### dbt Expectations 包
369
-
370
- ```yaml
371
- # packages.yml
372
- packages:
373
- - package: calogica/dbt_expectations
374
- version: 0.9.0
375
- ```
376
-
377
- ```yaml
378
- # 使用 dbt_expectations
379
- models:
380
- - name: users
381
- columns:
382
- - name: email
383
- tests:
384
- - dbt_expectations.expect_column_values_to_match_regex:
385
- regex: "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$"
386
-
387
- - name: age
388
- tests:
389
- - dbt_expectations.expect_column_mean_to_be_between:
390
- min_value: 18
391
- max_value: 65
392
-
393
- - name: created_at
394
- tests:
395
- - dbt_expectations.expect_row_values_to_have_recent_data:
396
- datepart: day
397
- interval: 7
398
- ```
399
-
400
- ## 数据验证规则
401
-
402
- ### 完整性检查
403
-
404
- ```python
405
- import pandas as pd
406
-
407
- def check_completeness(df: pd.DataFrame, required_columns: list) -> dict:
408
- """检查数据完整性"""
409
- results = {}
410
-
411
- # 检查必需列
412
- missing_columns = set(required_columns) - set(df.columns)
413
- results['missing_columns'] = list(missing_columns)
414
-
415
- # 检查空值
416
- null_counts = df[required_columns].isnull().sum()
417
- results['null_counts'] = null_counts.to_dict()
418
-
419
- # 检查空字符串
420
- for col in required_columns:
421
- if df[col].dtype == 'object':
422
- empty_count = (df[col] == '').sum()
423
- results[f'{col}_empty_count'] = empty_count
424
-
425
- return results
426
-
427
- # 使用示例
428
- required_cols = ['user_id', 'email', 'name']
429
- completeness = check_completeness(df, required_cols)
430
- ```
431
-
432
- ### 准确性检查
433
-
434
- ```python
435
- def check_accuracy(df: pd.DataFrame) -> dict:
436
- """检查数据准确性"""
437
- results = {}
438
-
439
- # 数值范围检查
440
- if 'age' in df.columns:
441
- invalid_age = df[(df['age'] < 0) | (df['age'] > 120)]
442
- results['invalid_age_count'] = len(invalid_age)
443
-
444
- # 格式检查
445
- if 'email' in df.columns:
446
- email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
447
- invalid_email = df[~df['email'].str.match(email_pattern, na=False)]
448
- results['invalid_email_count'] = len(invalid_email)
449
-
450
- # 逻辑检查
451
- if 'start_date' in df.columns and 'end_date' in df.columns:
452
- invalid_dates = df[df['start_date'] > df['end_date']]
453
- results['invalid_date_range_count'] = len(invalid_dates)
454
-
455
- return results
456
- ```
457
-
458
- ### 一致性检查
459
-
460
- ```python
461
- def check_consistency(df1: pd.DataFrame, df2: pd.DataFrame, key: str) -> dict:
462
- """检查数据一致性"""
463
- results = {}
464
-
465
- # 主键一致性
466
- keys1 = set(df1[key])
467
- keys2 = set(df2[key])
468
-
469
- results['only_in_df1'] = len(keys1 - keys2)
470
- results['only_in_df2'] = len(keys2 - keys1)
471
- results['in_both'] = len(keys1 & keys2)
472
-
473
- # 值一致性
474
- merged = df1.merge(df2, on=key, suffixes=('_1', '_2'))
475
- for col in df1.columns:
476
- if col != key and f'{col}_2' in merged.columns:
477
- inconsistent = merged[merged[f'{col}_1'] != merged[f'{col}_2']]
478
- results[f'{col}_inconsistent_count'] = len(inconsistent)
479
-
480
- return results
481
- ```
482
-
483
- ### 及时性检查
484
-
485
- ```python
486
- from datetime import datetime, timedelta
487
-
488
- def check_timeliness(df: pd.DataFrame, timestamp_col: str, max_age_hours: int = 24) -> dict:
489
- """检查数据及时性"""
490
- results = {}
491
-
492
- df[timestamp_col] = pd.to_datetime(df[timestamp_col])
493
- now = datetime.now()
494
- threshold = now - timedelta(hours=max_age_hours)
495
-
496
- # 过期数据
497
- stale_data = df[df[timestamp_col] < threshold]
498
- results['stale_count'] = len(stale_data)
499
- results['stale_percentage'] = len(stale_data) / len(df) * 100
500
-
501
- # 最新数据时间
502
- results['latest_timestamp'] = df[timestamp_col].max()
503
- results['oldest_timestamp'] = df[timestamp_col].min()
504
- results['data_age_hours'] = (now - df[timestamp_col].max()).total_seconds() / 3600
505
-
506
- return results
507
- ```
508
-
509
- ## 数据血缘追踪
510
-
511
- ### dbt 血缘
512
-
513
- ```sql
514
- -- models/staging/stg_users.sql
515
- SELECT
516
- user_id,
517
- email,
518
- created_at
519
- FROM {{ source('raw', 'users') }}
520
-
521
- -- models/marts/dim_users.sql
522
- SELECT
523
- user_id,
524
- email,
525
- DATE(created_at) AS created_date
526
- FROM {{ ref('stg_users') }}
527
-
528
- -- models/marts/fct_orders.sql
529
- SELECT
530
- o.order_id,
531
- u.user_id,
532
- o.amount
533
- FROM {{ ref('stg_orders') }} o
534
- LEFT JOIN {{ ref('dim_users') }} u
535
- ON o.user_id = u.user_id
536
- ```
537
-
538
- ```bash
539
- # 生成血缘图
540
- dbt docs generate
541
- dbt docs serve
542
-
543
- # 查看血缘关系
544
- # http://localhost:8080
545
- ```
546
-
547
- ### 自定义血缘追踪
548
-
549
- ```python
550
- from dataclasses import dataclass
551
- from typing import List, Dict
552
-
553
- @dataclass
554
- class DataLineage:
555
- """数据血缘"""
556
- table_name: str
557
- upstream_tables: List[str]
558
- transformation: str
559
- created_at: str
560
-
561
- class LineageTracker:
562
- """血缘追踪器"""
563
-
564
- def __init__(self):
565
- self.lineage: Dict[str, DataLineage] = {}
566
-
567
- def register(self, table_name: str, upstream: List[str], transformation: str):
568
- """注册血缘关系"""
569
- self.lineage[table_name] = DataLineage(
570
- table_name=table_name,
571
- upstream_tables=upstream,
572
- transformation=transformation,
573
- created_at=datetime.now().isoformat()
574
- )
575
-
576
- def get_upstream(self, table_name: str, recursive: bool = False) -> List[str]:
577
- """获取上游表"""
578
- if table_name not in self.lineage:
579
- return []
580
-
581
- upstream = self.lineage[table_name].upstream_tables
582
-
583
- if recursive:
584
- all_upstream = set(upstream)
585
- for table in upstream:
586
- all_upstream.update(self.get_upstream(table, recursive=True))
587
- return list(all_upstream)
588
-
589
- return upstream
590
-
591
- def get_downstream(self, table_name: str) -> List[str]:
592
- """获取下游表"""
593
- downstream = []
594
- for name, lineage in self.lineage.items():
595
- if table_name in lineage.upstream_tables:
596
- downstream.append(name)
597
- return downstream
598
-
599
- # 使用示例
600
- tracker = LineageTracker()
601
-
602
- tracker.register('stg_users', ['raw.users'], 'SELECT * FROM raw.users')
603
- tracker.register('dim_users', ['stg_users'], 'SELECT user_id, email FROM stg_users')
604
- tracker.register('fct_orders', ['stg_orders', 'dim_users'], 'JOIN transformation')
605
-
606
- print(tracker.get_upstream('fct_orders', recursive=True))
607
- # ['stg_orders', 'dim_users', 'stg_users', 'raw.users']
608
- ```
609
-
610
- ## 数据质量监控
611
-
612
- ### 质量指标计算
613
-
614
- ```python
615
- import pandas as pd
616
- from typing import Dict
617
-
618
- class DataQualityMetrics:
619
- """数据质量指标"""
620
-
621
- @staticmethod
622
- def calculate_completeness(df: pd.DataFrame) -> float:
623
- """完整性得分"""
624
- total_cells = df.size
625
- non_null_cells = df.count().sum()
626
- return (non_null_cells / total_cells) * 100
627
-
628
- @staticmethod
629
- def calculate_uniqueness(df: pd.DataFrame, key_columns: List[str]) -> float:
630
- """唯一性得分"""
631
- total_rows = len(df)
632
- unique_rows = df[key_columns].drop_duplicates().shape[0]
633
- return (unique_rows / total_rows) * 100
634
-
635
- @staticmethod
636
- def calculate_validity(df: pd.DataFrame, rules: Dict) -> float:
637
- """有效性得分"""
638
- total_rows = len(df)
639
- valid_rows = total_rows
640
-
641
- for column, rule in rules.items():
642
- if rule['type'] == 'range':
643
- invalid = df[
644
- (df[column] < rule['min']) | (df[column] > rule['max'])
645
- ]
646
- valid_rows -= len(invalid)
647
- elif rule['type'] == 'regex':
648
- invalid = df[~df[column].str.match(rule['pattern'], na=False)]
649
- valid_rows -= len(invalid)
650
-
651
- return (valid_rows / total_rows) * 100
652
-
653
- @staticmethod
654
- def calculate_overall_score(metrics: Dict[str, float]) -> float:
655
- """综合质量得分"""
656
- weights = {
657
- 'completeness': 0.3,
658
- 'uniqueness': 0.2,
659
- 'validity': 0.3,
660
- 'timeliness': 0.2,
661
- }
662
-
663
- score = sum(metrics.get(k, 0) * v for k, v in weights.items())
664
- return score
665
-
666
- # 使用示例
667
- metrics = DataQualityMetrics()
668
-
669
- completeness = metrics.calculate_completeness(df)
670
- uniqueness = metrics.calculate_uniqueness(df, ['user_id'])
671
- validity = metrics.calculate_validity(df, {
672
- 'age': {'type': 'range', 'min': 0, 'max': 120}
673
- })
674
-
675
- overall = metrics.calculate_overall_score({
676
- 'completeness': completeness,
677
- 'uniqueness': uniqueness,
678
- 'validity': validity,
679
- 'timeliness': 95.0,
680
- })
681
-
682
- print(f"Overall Quality Score: {overall:.2f}%")
683
- ```
684
-
685
- ### 质量告警
686
-
687
- ```python
688
- class QualityAlert:
689
- """质量告警"""
690
-
691
- def __init__(self, thresholds: Dict[str, float]):
692
- self.thresholds = thresholds
693
-
694
- def check_and_alert(self, metrics: Dict[str, float]) -> List[str]:
695
- """检查并生成告警"""
696
- alerts = []
697
-
698
- for metric, value in metrics.items():
699
- threshold = self.thresholds.get(metric)
700
- if threshold and value < threshold:
701
- alerts.append(
702
- f"ALERT: {metric} is {value:.2f}%, "
703
- f"below threshold {threshold}%"
704
- )
705
-
706
- return alerts
707
-
708
- # 使用示例
709
- alert_system = QualityAlert({
710
- 'completeness': 95.0,
711
- 'uniqueness': 99.0,
712
- 'validity': 98.0,
713
- })
714
-
715
- alerts = alert_system.check_and_alert({
716
- 'completeness': 92.5,
717
- 'uniqueness': 99.5,
718
- 'validity': 97.0,
719
- })
720
-
721
- for alert in alerts:
722
- print(alert)
723
- # 发送通知(Slack/Email/PagerDuty)
724
- ```
725
-
726
- ## Soda Core 集成
727
-
728
- ### 安装和配置
729
-
730
- ```bash
731
- # 安装
732
- pip install soda-core-postgres
733
-
734
- # 配置
735
- # configuration.yml
736
- data_source my_datasource:
737
- type: postgres
738
- host: localhost
739
- port: 5432
740
- username: user
741
- password: pass
742
- database: mydb
743
- ```
744
-
745
- ### Checks 定义
746
-
747
- ```yaml
748
- # checks.yml
749
- checks for users:
750
- - row_count > 100
751
- - missing_count(user_id) = 0
752
- - missing_count(email) = 0
753
- - duplicate_count(user_id) = 0
754
- - duplicate_count(email) = 0
755
- - invalid_count(email) = 0:
756
- valid format: email
757
- - invalid_count(age) = 0:
758
- valid min: 0
759
- valid max: 120
760
- - values in (status) must be in ['active', 'inactive', 'pending']
761
- - freshness(created_at) < 1d
762
- ```
763
-
764
- ### 执行检查
765
-
766
- ```python
767
- from soda.scan import Scan
768
-
769
- # 创建扫描
770
- scan = Scan()
771
- scan.set_data_source_name("my_datasource")
772
- scan.add_configuration_yaml_file("configuration.yml")
773
- scan.add_sodacl_yaml_file("checks.yml")
774
-
775
- # 执行扫描
776
- scan.execute()
777
-
778
- # 检查结果
779
- if scan.has_check_fails():
780
- print("Quality checks failed!")
781
- for check in scan.get_checks_fail():
782
- print(f" - {check}")
783
- else:
784
- print("All quality checks passed!")
785
- ```
786
-
787
- ## 最佳实践
788
-
789
- ### 分层验证策略
790
-
791
- ```python
792
- # 1. 源数据验证
793
- def validate_source(df: pd.DataFrame):
794
- """源数据验证"""
795
- assert not df.empty, "Source data is empty"
796
- assert df['id'].is_unique, "Duplicate IDs in source"
797
-
798
- # 2. 转换验证
799
- def validate_transformation(input_df: pd.DataFrame, output_df: pd.DataFrame):
800
- """转换验证"""
801
- assert len(output_df) <= len(input_df), "Row count increased"
802
- assert set(output_df['id']).issubset(set(input_df['id'])), "New IDs appeared"
803
-
804
- # 3. 目标验证
805
- def validate_target(df: pd.DataFrame):
806
- """目标验证"""
807
- assert df['amount'].sum() > 0, "Total amount is zero"
808
- assert df['date'].max() >= pd.Timestamp.now() - pd.Timedelta(days=1), "Data is stale"
809
- ```
810
-
811
- ### 持续质量监控
812
-
813
- ```python
814
- import schedule
815
- import time
816
-
817
- def run_quality_checks():
818
- """运行质量检查"""
819
- df = load_data()
820
-
821
- metrics = {
822
- 'completeness': calculate_completeness(df),
823
- 'validity': calculate_validity(df),
824
- 'timeliness': calculate_timeliness(df),
825
- }
826
-
827
- # 记录指标
828
- log_metrics(metrics)
829
-
830
- # 检查告警
831
- alerts = check_alerts(metrics)
832
- if alerts:
833
- send_notifications(alerts)
834
-
835
- # 定时执行
836
- schedule.every(1).hours.do(run_quality_checks)
837
-
838
- while True:
839
- schedule.run_pending()
840
- time.sleep(60)
841
- ```
842
-
843
- ### 质量报告生成
844
-
845
- ```python
846
- def generate_quality_report(df: pd.DataFrame) -> str:
847
- """生成质量报告"""
848
- report = []
849
-
850
- report.append("# Data Quality Report")
851
- report.append(f"Generated at: {datetime.now()}")
852
- report.append(f"\n## Dataset Overview")
853
- report.append(f"- Total Rows: {len(df)}")
854
- report.append(f"- Total Columns: {len(df.columns)}")
855
-
856
- report.append(f"\n## Completeness")
857
- null_counts = df.isnull().sum()
858
- for col, count in null_counts.items():
859
- if count > 0:
860
- pct = (count / len(df)) * 100
861
- report.append(f"- {col}: {count} nulls ({pct:.2f}%)")
862
-
863
- report.append(f"\n## Duplicates")
864
- duplicates = df.duplicated().sum()
865
- report.append(f"- Total Duplicates: {duplicates}")
866
-
867
- return "\n".join(report)
868
- ```
869
-
870
- ## 工具对比
871
-
872
- | 工具 | 优势 | 适用场景 |
873
- |------|------|----------|
874
- | Great Expectations | 丰富的 Expectations、Data Docs | Python 生态、复杂验证 |
875
- | dbt | SQL 原生、血缘追踪 | 数据仓库、转换测试 |
876
- | Soda Core | 简洁的 YAML 配置 | 快速验证、CI/CD |
877
- | Apache Griffin | 大数据质量 | Hadoop/Spark 生态 |
878
- | Deequ | Spark 原生 | 大规模数据验证 |
879
-
880
- ## 工具清单
881
-
882
- | 工具 | 用途 | 推荐场景 |
883
- |------|------|----------|
884
- | Great Expectations | 数据验证框架 | Python 数据管道 |
885
- | dbt | 数据转换测试 | SQL 数据仓库 |
886
- | Soda Core | 数据质量检查 | 轻量级验证 |
887
- | Apache Griffin | 大数据质量 | Hadoop 生态 |
888
- | Deequ | Spark 数据质量 | 大规模数据 |
889
- | Monte Carlo | 数据可观测性 | 企业级监控 |
890
- | Datafold | 数据 Diff | 变更验证 |
891
-
892
- ## 触发词
893
-
894
- 数据质量、Great Expectations、dbt、数据验证、数据测试、完整性、准确性、一致性、数据血缘、质量监控