gitlab_quality-test_tooling 3.7.0 → 3.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/gitlab_quality/test_tooling/code_coverage/click_house/category_owners_table.rb +3 -3
- data/lib/gitlab_quality/test_tooling/code_coverage/click_house/test_file_mappings_table.rb +142 -0
- data/lib/gitlab_quality/test_tooling/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 0eb24e5b8b53d2478c3994cd26ca10425b01e0d7d31b51e125fedeb799d2b420
|
|
4
|
+
data.tar.gz: 374324661a08dafc46f7b92cd63136878bbeca121f5c4293bec7b4bb22ef7b1e
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 5803a5c51bdb252e3623379ef3fe0d791c344463ded48a47e568381a5014282f56d62e328b85cf7746940736130223585ced38a6a68747a47c83ae5fef7265df
|
|
7
|
+
data.tar.gz: 63df90311f5de1f0906f46402299e156c596eea5f78bc466c258cd9bda8aff961b01fb73f415f9d5ea56320ae950759659e98d46fb8cdd1b8d51c71b44d51478
|
data/Gemfile.lock
CHANGED
|
data/lib/gitlab_quality/test_tooling/code_coverage/click_house/category_owners_table.rb
CHANGED
|
@@ -13,13 +13,13 @@ module GitlabQuality
|
|
|
13
13
|
|
|
14
14
|
KNOWN_UNOWNED = %w[shared not_owned tooling].freeze
|
|
15
15
|
|
|
16
|
-
# SQL query to get the latest ownership record for each category
|
|
17
|
-
#
|
|
16
|
+
# SQL query to get the latest ownership record for each unique category+ownership combination
|
|
17
|
+
# Partitions by the full composite key to handle cases where a category has multiple ownerships
|
|
18
18
|
LATEST_RECORDS_QUERY = <<~SQL
|
|
19
19
|
SELECT category, group, stage, section
|
|
20
20
|
FROM (
|
|
21
21
|
SELECT category, group, stage, section,
|
|
22
|
-
ROW_NUMBER() OVER (PARTITION BY category ORDER BY timestamp DESC) as rn
|
|
22
|
+
ROW_NUMBER() OVER (PARTITION BY category, group, stage, section ORDER BY timestamp DESC) as rn
|
|
23
23
|
FROM %{table_name}
|
|
24
24
|
)
|
|
25
25
|
WHERE rn = 1
|
data/lib/gitlab_quality/test_tooling/code_coverage/click_house/test_file_mappings_table.rb
CHANGED
|
@@ -9,6 +9,36 @@ module GitlabQuality
|
|
|
9
9
|
class TestFileMappingsTable < GitlabQuality::TestTooling::CodeCoverage::ClickHouse::Table
|
|
10
10
|
TABLE_NAME = "test_file_mappings"
|
|
11
11
|
|
|
12
|
+
# Override push to filter out duplicate mappings before inserting
|
|
13
|
+
# This prevents accumulating 36M duplicate rows per day in ReplacingMergeTree
|
|
14
|
+
#
|
|
15
|
+
# @param data [Array<Hash>] Code coverage related data to be pushed to ClickHouse
|
|
16
|
+
# @return [nil]
|
|
17
|
+
def push(data) # rubocop:disable Metrics/AbcSize
  # Export coverage mappings, inserting only the rows that filter_duplicates
  # reports as new or changed. Any StandardError is logged and re-raised.
  if data.empty?
    return logger.warn("#{LOG_PREFIX} No data found, skipping ClickHouse export!")
  end

  logger.debug("#{LOG_PREFIX} Starting data export to ClickHouse")
  valid_rows = sanitize(data)

  if valid_rows.empty?
    return logger.warn("#{LOG_PREFIX} No valid data found after sanitization, skipping ClickHouse export!")
  end

  # Keep only rows that are not already stored with identical values.
  rows_to_insert = filter_duplicates(valid_rows)

  if rows_to_insert.empty?
    logger.info("#{LOG_PREFIX} All #{valid_rows.size} mappings already exist with same values, skipping insert to #{full_table_name}")
    return
  end

  client.insert_json_data(table_name, rows_to_insert)

  duplicates_skipped = valid_rows.size - rows_to_insert.size
  logger.info(
    "#{LOG_PREFIX} Successfully pushed #{rows_to_insert.size} new/changed records " \
    "to #{full_table_name} (skipped #{duplicates_skipped} duplicates)"
  )
rescue StandardError => e
  # Log with table context, then let the caller see the original exception.
  logger.error("#{LOG_PREFIX} Error occurred while pushing data to #{full_table_name}: #{e.message}")
  raise
end
|
|
41
|
+
|
|
12
42
|
private
|
|
13
43
|
|
|
14
44
|
# @return [Boolean] True if the record is valid, false otherwise
|
|
@@ -38,6 +68,9 @@ module GitlabQuality
|
|
|
38
68
|
timestamp: time,
|
|
39
69
|
test_file: record[:test_file],
|
|
40
70
|
source_file: record[:source_file],
|
|
71
|
+
# CI_PROJECT_PATH is set by GitLab CI runner and considered trusted infrastructure input.
|
|
72
|
+
# In non-CI environments, this may be user-controlled, but sanitize_for_clickhouse
|
|
73
|
+
# provides SQL injection protection.
|
|
41
74
|
ci_project_path: ENV.fetch('CI_PROJECT_PATH', nil),
|
|
42
75
|
category: record[:category] || '',
|
|
43
76
|
group: record[:group] || '',
|
|
@@ -45,6 +78,115 @@ module GitlabQuality
|
|
|
45
78
|
section: record[:section] || ''
|
|
46
79
|
}
|
|
47
80
|
end
|
|
81
|
+
|
|
82
|
+
# Filter out mappings that already exist in ClickHouse with identical enriched column values
|
|
83
|
+
# Only returns records that are:
|
|
84
|
+
# - New mappings (test_file + source_file combination doesn't exist), OR
|
|
85
|
+
# - Changed mappings (enriched columns category/group/stage/section differ from existing)
|
|
86
|
+
#
|
|
87
|
+
# This prevents inserting 36M duplicate rows per day while still allowing updates when
|
|
88
|
+
# ownership (category/group/stage/section) changes over time.
|
|
89
|
+
#
|
|
90
|
+
# @param records [Array<Hash>] Sanitized records to insert
|
|
91
|
+
# @return [Array<Hash>] Records that are new or have changed values
|
|
92
|
+
def filter_duplicates(records) # rubocop:disable Metrics/AbcSize
  return records if records.empty?

  # Assumes all records belong to the same project (batch from single pipeline run)
  ci_project_path = records.first[:ci_project_path]

  # Can't query without a project path. Plain-Ruby check for nil / empty /
  # whitespace-only, so this guard does not depend on ActiveSupport's `blank?`.
  # If `blank?` were undefined, the NoMethodError would be swallowed by the
  # rescue below and de-duplication would be silently disabled.
  return records if ci_project_path.to_s.match?(/\A[[:space:]]*\z/)

  logger.debug("#{LOG_PREFIX} Checking for duplicate mappings in #{full_table_name}")

  # Lookup of what is already stored:
  # "test_file|source_file" => { category:, group:, stage:, section: }
  existing_mappings = fetch_existing_mappings(ci_project_path, records)

  # Keep a record when its mapping is unknown, or when any enriched column differs.
  new_or_changed = records.select do |record|
    existing = existing_mappings["#{record[:test_file]}|#{record[:source_file]}"]

    existing.nil? || %i[category group stage section].any? { |column| existing[column] != record[column] }
  end

  logger.debug("#{LOG_PREFIX} Found #{new_or_changed.size} new/changed mappings out of #{records.size} total")
  new_or_changed
rescue StandardError => e
  # If duplicate detection fails, fall back to inserting all records (safer than failing completely)
  logger.warn("#{LOG_PREFIX} Failed to check for duplicates: #{e.message}. Inserting all records.")
  records
end
|
|
127
|
+
|
|
128
|
+
# Fetch existing mappings from ClickHouse for the given project
|
|
129
|
+
# Returns a hash mapping "test_file|source_file" => {category, group, stage, section}
|
|
130
|
+
#
|
|
131
|
+
# @param ci_project_path [String] The CI project path to filter by
|
|
132
|
+
# @param records [Array<Hash>] Records being inserted (used to limit query scope)
|
|
133
|
+
# @return [Hash] Existing mappings with their enriched column values
|
|
134
|
+
def fetch_existing_mappings(ci_project_path, records) # rubocop:disable Metrics/AbcSize
  # Query for ALL latest mappings for this project.
  # Narrowing to the requested test files happens in Ruby below, so the only
  # dynamic value placed into the SQL text is the project path.
  sql = <<~SQL
    SELECT
      test_file,
      source_file,
      category,
      `group`,
      stage,
      section
    FROM #{full_table_name}
    WHERE ci_project_path = {project_path:String}
      AND (ci_project_path, test_file, source_file, timestamp) IN (
        SELECT
          ci_project_path,
          test_file,
          source_file,
          MAX(timestamp) AS timestamp
        FROM #{full_table_name}
        WHERE ci_project_path = {project_path:String}
        GROUP BY ci_project_path, test_file, source_file
      )
  SQL

  # Substitute the placeholder with a manually escaped literal.
  # The block form of gsub is required here: with a *string* replacement Ruby
  # re-interprets backslash sequences, collapsing the escaped "\\\\" back into a
  # single backslash and undoing sanitize_for_clickhouse.
  quoted_project_path = "'#{sanitize_for_clickhouse(ci_project_path)}'"
  sql_with_params = sql.strip.gsub('{project_path:String}') { quoted_project_path }
  results = client.query(sql_with_params)

  # Build lookup hash, filtering to only the test_files we're checking if dataset is small
  test_files_set = records.to_set { |r| r[:test_file] } if records.size <= 10_000

  results.each_with_object({}) do |row, hash|
    # Skip if we're doing selective filtering and this test_file isn't in our set
    next if test_files_set && !test_files_set.include?(row['test_file'])

    hash["#{row['test_file']}|#{row['source_file']}"] = {
      category: row['category'],
      group: row['group'],
      stage: row['stage'],
      section: row['section']
    }
  end
rescue StandardError => e
  logger.warn("#{LOG_PREFIX} Failed to fetch existing mappings: #{e.message}")
  {} # Return empty hash on error (will treat all records as new)
end
|
|
182
|
+
|
|
183
|
+
# Sanitize string for ClickHouse by escaping single quotes and backslashes
|
|
184
|
+
# This protects against SQL injection when we must use string interpolation
|
|
185
|
+
# @param str [String] String to sanitize
|
|
186
|
+
# @return [String] Sanitized string
|
|
187
|
+
def sanitize_for_clickhouse(str)
  # One pass over the string: every backslash or single quote is emitted twice
  # (\ becomes \\ and ' becomes ''). Non-string input is converted with to_s first.
  str.to_s.gsub(/[\\']/) { |special| special * 2 }
end
|
|
48
190
|
end
|
|
49
191
|
end
|
|
50
192
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: gitlab_quality-test_tooling
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 3.7.0
|
|
4
|
+
version: 3.7.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- GitLab Quality
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-02-
|
|
11
|
+
date: 2026-02-10 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: climate_control
|