gitlab_quality-test_tooling 3.6.1 → 3.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6922c9895f727e5c4ad2c7fd9ecc7ffa2c17f0de9125ae24def5627dc040d2f9
4
- data.tar.gz: 89a32f162c38d2098ffad75338adfb3ef3d90b773819ba06b03d03db111cd252
3
+ metadata.gz: e6358a99fbbffd427e08eb0d8fa81eb1b094d91967391b1dbfe9991f8ef754c2
4
+ data.tar.gz: baaab01155aba0a2a897d419036fa4bba9db8afec7e0138e71e1c78296a2e56a
5
5
  SHA512:
6
- metadata.gz: 1c216c46fe2a705acba58bee70576cc7e4a71773802bc58ac59f60ef3ca413186f4a3bf5255712986b8943ffe7248797ce1e9d35a078176fa76cad458ea95519
7
- data.tar.gz: 69c7c48e2270d86d89b6c12d5d11800079fe20af2c6024f65b8d2d35f89499a63816cb188a3c086520f571a9288cd51330849b115554904c7e802d9e2d2fff42
6
+ metadata.gz: 111fdd1f23adf766111ebbf0b31190f994f941f15fd4263cb2c9784307891cd34cc39a26636e9eccd6134651ab416396affeb6284bae584d6f6f23d3cf2d5d53
7
+ data.tar.gz: 03aaff6cbb173014e5c47373c0af889b2d86ceaf4ebaeb86c7e8ec92774864fac3adc914e3a11b4d1f76176c86b966e12910c78f852954d2b1650b980427a9eb
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- gitlab_quality-test_tooling (3.6.1)
4
+ gitlab_quality-test_tooling (3.8.0)
5
5
  activesupport (>= 7.0)
6
6
  amatch (~> 0.4.1)
7
7
  fog-google (~> 1.24, >= 1.24.1)
data/README.md CHANGED
@@ -292,7 +292,7 @@ Options:
292
292
  --clickhouse-username USERNAME
293
293
  ClickHouse username
294
294
  --clickhouse-shared-database DATABASE
295
- ClickHouse shared database name (default: shared)
295
+ ClickHouse shared database name
296
296
  --responsibility-patterns PATH
297
297
  Path to YAML file with responsibility classification patterns
298
298
 
@@ -312,7 +312,7 @@ Usage: exe/sync-category-owners [options]
312
312
  Options:
313
313
  --clickhouse-url URL ClickHouse server URL
314
314
  --clickhouse-database DATABASE
315
- ClickHouse database name (default: shared)
315
+ ClickHouse database name
316
316
  --clickhouse-username USERNAME
317
317
  ClickHouse username
318
318
 
@@ -24,7 +24,7 @@ options = OptionParser.new do |opts|
24
24
  params[:clickhouse_url] = url
25
25
  end
26
26
 
27
- opts.on('--clickhouse-database DATABASE', 'ClickHouse database name (default: shared)') do |database|
27
+ opts.on('--clickhouse-database DATABASE', 'ClickHouse database name') do |database|
28
28
  params[:clickhouse_database] = database
29
29
  end
30
30
 
@@ -51,9 +51,6 @@ options = OptionParser.new do |opts|
51
51
  opts.parse(ARGV)
52
52
  end
53
53
 
54
- # Default database to 'shared' if not specified
55
- params[:clickhouse_database] ||= 'shared'
56
-
57
54
  if params.any? && (required_params - params.keys).none?
58
55
  clickhouse_password = ENV.fetch('GLCI_CLICKHOUSE_METRICS_PASSWORD', nil)
59
56
  if clickhouse_password.to_s.strip.empty?
@@ -88,10 +85,9 @@ if params.any? && (required_params - params.keys).none?
88
85
  password: clickhouse_password
89
86
  )
90
87
 
91
- category_owners_table.truncate
92
88
  category_owners_table.push(category_owners.as_db_table)
93
89
 
94
- puts "Successfully synced #{category_owners.as_db_table.length} feature categories to ClickHouse"
90
+ puts "Successfully synced feature category ownership data to ClickHouse"
95
91
  else
96
92
  puts "Missing argument(s). Required arguments are: #{required_params.map { |p| "--#{p.to_s.tr('_', '-')}" }.join(', ')}"
97
93
  puts options
data/exe/test-coverage CHANGED
@@ -22,7 +22,7 @@ require_relative '../lib/gitlab_quality/test_tooling/code_coverage/responsibilit
22
22
  require_relative '../lib/gitlab_quality/test_tooling/code_coverage/responsibility_patterns_config'
23
23
 
24
24
  params = {}
25
- required_params = [:test_reports, :coverage_report, :test_map, :clickhouse_url, :clickhouse_database, :clickhouse_username, :responsibility_patterns]
25
+ required_params = [:test_reports, :coverage_report, :test_map, :clickhouse_url, :clickhouse_database, :clickhouse_username, :clickhouse_shared_database, :responsibility_patterns]
26
26
 
27
27
  options = OptionParser.new do |opts|
28
28
  opts.banner = "Usage: #{$PROGRAM_NAME} [options]"
@@ -55,7 +55,7 @@ options = OptionParser.new do |opts|
55
55
  params[:clickhouse_username] = username
56
56
  end
57
57
 
58
- opts.on('--clickhouse-shared-database DATABASE', 'ClickHouse shared database name (default: shared)') do |database|
58
+ opts.on('--clickhouse-shared-database DATABASE', 'ClickHouse shared database name') do |database|
59
59
  params[:clickhouse_shared_database] = database
60
60
  end
61
61
 
@@ -91,7 +91,7 @@ if params.any? && (required_params - params.keys).none?
91
91
  exit 1
92
92
  end
93
93
 
94
- [:clickhouse_url, :clickhouse_database, :clickhouse_username].each do |param|
94
+ [:clickhouse_url, :clickhouse_database, :clickhouse_username, :clickhouse_shared_database].each do |param|
95
95
  if params[param].to_s.strip.empty?
96
96
  puts "Error: --#{param.to_s.tr('_', '-')} cannot be empty"
97
97
  exit 1
@@ -172,7 +172,7 @@ if params.any? && (required_params - params.keys).none?
172
172
 
173
173
  shared_clickhouse_data = {
174
174
  url: params[:clickhouse_url],
175
- database: params[:clickhouse_shared_database] || 'shared',
175
+ database: params[:clickhouse_shared_database],
176
176
  username: params[:clickhouse_username],
177
177
  password: clickhouse_password
178
178
  }
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'active_support/core_ext/object/deep_dup'
4
+
3
5
  require_relative 'table'
4
6
 
5
7
  module GitlabQuality
@@ -13,12 +15,33 @@ module GitlabQuality
13
15
 
14
16
  KNOWN_UNOWNED = %w[shared not_owned tooling].freeze
15
17
 
16
- def truncate
17
- logger.debug("#{LOG_PREFIX} Truncating table #{full_table_name} ...")
18
+ # SQL query to get the latest ownership record for each unique category+ownership combination
19
+ # Partitions by the full composite key to handle cases where a category has multiple ownerships
20
+ LATEST_RECORDS_QUERY = <<~SQL
21
+ SELECT category, group, stage, section
22
+ FROM (
23
+ SELECT category, group, stage, section,
24
+ ROW_NUMBER() OVER (PARTITION BY category, group, stage, section ORDER BY timestamp DESC) as rn
25
+ FROM %{table_name}
26
+ )
27
+ WHERE rn = 1
28
+ SQL
29
+
30
+ # Insert only new category ownership records that don't already exist
31
+ # This avoids needing TRUNCATE permission
32
+ def push(data)
33
+ return logger.warn("#{LOG_PREFIX} No data found, skipping insert!") if data.empty?
18
34
 
19
- client.query("TRUNCATE TABLE #{full_table_name}")
35
+ sanitized_data = sanitize_and_filter_data(data)
36
+ return if sanitized_data.empty?
20
37
 
21
- logger.info("#{LOG_PREFIX} Successfully truncated table #{full_table_name}")
38
+ new_records = filter_new_records(sanitized_data)
39
+ return if new_records.empty?
40
+
41
+ insert_new_records(new_records, sanitized_data.size)
42
+ rescue StandardError => e
43
+ logger.error("#{LOG_PREFIX} Error occurred while pushing data to #{full_table_name}: #{e.message}")
44
+ raise
22
45
  end
23
46
 
24
47
  # Owners of particular feature category as group, stage and section
@@ -38,17 +61,84 @@ module GitlabQuality
38
61
  raise(MissingMappingError, "Feature category '#{feature_category_name}' not found in table '#{table_name}'")
39
62
  end
40
63
 
64
+ # Raw category owner data
65
+ #
66
+ # @return [Hash]
67
+ def owner_records
68
+ records.deep_dup
69
+ end
70
+
41
71
  private
42
72
 
43
73
  def records
44
- @records ||= client
45
- .query("SELECT category, group, stage, section FROM #{table_name}")
46
- .each_with_object({}) { |record, hsh| hsh[record["category"]] = record.slice("group", "stage", "section") }
74
+ @records ||= fetch_latest_records.each_with_object({}) do |record, hsh|
75
+ hsh[record["category"]] = record.slice("group", "stage", "section")
76
+ end
77
+ end
78
+
79
+ def sanitize_and_filter_data(data)
80
+ logger.debug("#{LOG_PREFIX} Starting data export to ClickHouse")
81
+ sanitized_data = sanitize(data)
82
+
83
+ logger.warn("#{LOG_PREFIX} No valid data found after sanitization, skipping ClickHouse export!") if sanitized_data.empty?
84
+
85
+ sanitized_data
86
+ end
87
+
88
+ def filter_new_records(sanitized_data)
89
+ existing_records = fetch_existing_records
90
+ # Deduplicate against latest records per category to prevent inserting duplicate historical records.
91
+ # This ensures we only insert records with new category+ownership combinations, even if an older
92
+ # version of the same category+ownership existed previously.
93
+ new_records = sanitized_data.reject { |record| existing_records.include?(record_key(record)) }
94
+
95
+ logger.info("#{LOG_PREFIX} No new records to insert, all data already exists") if new_records.empty?
96
+
97
+ new_records
98
+ end
99
+
100
+ def insert_new_records(new_records, total_sanitized_count)
101
+ client.insert_json_data(table_name, new_records)
102
+ new_count = new_records.size
103
+ existing_count = total_sanitized_count - new_count
104
+ record_word = new_count == 1 ? 'record' : 'records'
105
+ logger.info("#{LOG_PREFIX} Inserted #{new_count} new #{record_word} (#{existing_count} already existed)")
106
+ end
107
+
108
+ def fetch_existing_records
109
+ fetch_latest_records.to_set { |record| record_key(record) }
110
+ end
111
+
112
+ def fetch_latest_records
113
+ query = format(LATEST_RECORDS_QUERY, table_name: table_name)
114
+ client.query(query)
115
+ end
116
+
117
+ def sanitized_data_record(record)
118
+ {
119
+ timestamp: time,
120
+ category: record[:feature_category],
121
+ group: record[:group],
122
+ stage: record[:stage],
123
+ section: record[:section]
124
+ }
125
+ end
126
+
127
+ def record_key(record)
128
+ # Create a unique key for the combination of category + ownership
129
+ # Normalize to string keys for consistent access
130
+ normalized = record.transform_keys(&:to_s)
131
+ [
132
+ normalized["category"],
133
+ normalized["group"],
134
+ normalized["stage"],
135
+ normalized["section"]
136
+ ]
47
137
  end
48
138
 
49
139
  # @return [Boolean] True if the record is valid, false otherwise
50
140
  def valid_record?(record)
51
- required_fields = %i[category group stage section]
141
+ required_fields = %i[feature_category group stage section]
52
142
 
53
143
  required_fields.each do |field|
54
144
  if record[field].nil?
@@ -9,6 +9,36 @@ module GitlabQuality
9
9
  class TestFileMappingsTable < GitlabQuality::TestTooling::CodeCoverage::ClickHouse::Table
10
10
  TABLE_NAME = "test_file_mappings"
11
11
 
12
+ # Override push to filter out duplicate mappings before inserting
13
+ # This prevents accumulating 36M duplicate rows per day in ReplacingMergeTree
14
+ #
15
+ # @param data [Array<Hash>] Code coverage related data to be pushed to ClickHouse
16
+ # @return [nil]
17
+ def push(data) # rubocop:disable Metrics/AbcSize
18
+ return logger.warn("#{LOG_PREFIX} No data found, skipping ClickHouse export!") if data.empty?
19
+
20
+ logger.debug("#{LOG_PREFIX} Starting data export to ClickHouse")
21
+ sanitized_data = sanitize(data)
22
+
23
+ return logger.warn("#{LOG_PREFIX} No valid data found after sanitization, skipping ClickHouse export!") if sanitized_data.empty?
24
+
25
+ # Filter out records that already exist with identical values
26
+ new_or_changed_records = filter_duplicates(sanitized_data)
27
+
28
+ if new_or_changed_records.empty?
29
+ logger.info("#{LOG_PREFIX} All #{sanitized_data.size} mappings already exist with same values, skipping insert to #{full_table_name}")
30
+ return
31
+ end
32
+
33
+ client.insert_json_data(table_name, new_or_changed_records)
34
+ skipped_count = sanitized_data.size - new_or_changed_records.size
35
+ logger.info("#{LOG_PREFIX} Successfully pushed #{new_or_changed_records.size} new/changed records " \
36
+ "to #{full_table_name} (skipped #{skipped_count} duplicates)")
37
+ rescue StandardError => e
38
+ logger.error("#{LOG_PREFIX} Error occurred while pushing data to #{full_table_name}: #{e.message}")
39
+ raise
40
+ end
41
+
12
42
  private
13
43
 
14
44
  # @return [Boolean] True if the record is valid, false otherwise
@@ -38,6 +68,9 @@ module GitlabQuality
38
68
  timestamp: time,
39
69
  test_file: record[:test_file],
40
70
  source_file: record[:source_file],
71
+ # CI_PROJECT_PATH is set by GitLab CI runner and considered trusted infrastructure input.
72
+ # In non-CI environments, this may be user-controlled, but sanitize_for_clickhouse
73
+ # provides SQL injection protection.
41
74
  ci_project_path: ENV.fetch('CI_PROJECT_PATH', nil),
42
75
  category: record[:category] || '',
43
76
  group: record[:group] || '',
@@ -45,6 +78,115 @@ module GitlabQuality
45
78
  section: record[:section] || ''
46
79
  }
47
80
  end
81
+
82
+ # Filter out mappings that already exist in ClickHouse with identical enriched column values
83
+ # Only returns records that are:
84
+ # - New mappings (test_file + source_file combination doesn't exist), OR
85
+ # - Changed mappings (enriched columns category/group/stage/section differ from existing)
86
+ #
87
+ # This prevents inserting 36M duplicate rows per day while still allowing updates when
88
+ # ownership (category/group/stage/section) changes over time.
89
+ #
90
+ # @param records [Array<Hash>] Sanitized records to insert
91
+ # @return [Array<Hash>] Records that are new or have changed values
92
+ def filter_duplicates(records) # rubocop:disable Metrics/AbcSize
93
+ return records if records.empty?
94
+
95
+ # Assumes all records belong to the same project (batch from single pipeline run)
96
+ ci_project_path = records.first[:ci_project_path]
97
+ return records if ci_project_path.blank? # Can't query without project path
98
+
99
+ logger.debug("#{LOG_PREFIX} Checking for duplicate mappings in #{full_table_name}")
100
+
101
+ # Build a hash of existing mappings with their current enriched column values
102
+ # Key: "test_file|source_file", Value: {category, group, stage, section}
103
+ existing_mappings = fetch_existing_mappings(ci_project_path, records)
104
+
105
+ # Filter records: keep only new or changed mappings
106
+ new_or_changed = records.select do |record|
107
+ key = "#{record[:test_file]}|#{record[:source_file]}"
108
+ existing = existing_mappings[key]
109
+
110
+ # New mapping - doesn't exist yet
111
+ next true if existing.nil?
112
+
113
+ # Existing mapping - check if enriched columns changed
114
+ existing[:category] != record[:category] ||
115
+ existing[:group] != record[:group] ||
116
+ existing[:stage] != record[:stage] ||
117
+ existing[:section] != record[:section]
118
+ end
119
+
120
+ logger.debug("#{LOG_PREFIX} Found #{new_or_changed.size} new/changed mappings out of #{records.size} total")
121
+ new_or_changed
122
+ rescue StandardError => e
123
+ # If duplicate detection fails, fall back to inserting all records (safer than failing completely)
124
+ logger.warn("#{LOG_PREFIX} Failed to check for duplicates: #{e.message}. Inserting all records.")
125
+ records
126
+ end
127
+
128
+ # Fetch existing mappings from ClickHouse for the given project
129
+ # Returns a hash mapping "test_file|source_file" => {category, group, stage, section}
130
+ #
131
+ # @param ci_project_path [String] The CI project path to filter by
132
+ # @param records [Array<Hash>] Records being inserted (used to limit query scope)
133
+ # @return [Hash] Existing mappings with their enriched column values
134
+ def fetch_existing_mappings(ci_project_path, records) # rubocop:disable Metrics/AbcSize
135
+ # Query for ALL latest mappings for this project
136
+ # We filter in Ruby rather than building dynamic SQL to avoid injection risks
137
+ sql = <<~SQL
138
+ SELECT
139
+ test_file,
140
+ source_file,
141
+ category,
142
+ `group`,
143
+ stage,
144
+ section
145
+ FROM #{full_table_name}
146
+ WHERE ci_project_path = {project_path:String}
147
+ AND (ci_project_path, test_file, source_file, timestamp) IN (
148
+ SELECT
149
+ ci_project_path,
150
+ test_file,
151
+ source_file,
152
+ MAX(timestamp) AS timestamp
153
+ FROM #{full_table_name}
154
+ WHERE ci_project_path = {project_path:String}
155
+ GROUP BY ci_project_path, test_file, source_file
156
+ )
157
+ SQL
158
+
159
+ # Substitute project_path value using manual escaping (client doesn't support parameterized queries)
160
+ sql_with_params = sql.strip.gsub('{project_path:String}', "'#{sanitize_for_clickhouse(ci_project_path)}'")
161
+ results = client.query(sql_with_params)
162
+
163
+ # Build lookup hash, filtering to only the test_files we're checking if dataset is small
164
+ test_files_set = records.to_set { |r| r[:test_file] } if records.size <= 10_000
165
+
166
+ results.each_with_object({}) do |row, hash|
167
+ # Skip if we're doing selective filtering and this test_file isn't in our set
168
+ next if test_files_set && !test_files_set.include?(row['test_file'])
169
+
170
+ key = "#{row['test_file']}|#{row['source_file']}"
171
+ hash[key] = {
172
+ category: row['category'],
173
+ group: row['group'],
174
+ stage: row['stage'],
175
+ section: row['section']
176
+ }
177
+ end
178
+ rescue StandardError => e
179
+ logger.warn("#{LOG_PREFIX} Failed to fetch existing mappings: #{e.message}")
180
+ {} # Return empty hash on error (will treat all records as new)
181
+ end
182
+
183
+ # Sanitize string for ClickHouse by escaping single quotes and backslashes
184
+ # This protects against SQL injection when we must use string interpolation
185
+ # @param str [String] String to sanitize
186
+ # @return [String] Sanitized string
187
+ def sanitize_for_clickhouse(str)
188
+ str.to_s.gsub(/\\/, '\\\\\\\\').gsub("'", "''") # rubocop:disable Style/RedundantRegexpArgument
189
+ end
48
190
  end
49
191
  end
50
192
  end
@@ -671,7 +671,7 @@ module GitlabQuality
671
671
  # @return [String] the reason to ignore the failures, or `nil` if any failures should not be ignored.
672
672
  def ignored_failure_reason(failures, failure_to_ignore)
673
673
  failures_to_ignore = compute_ignored_failures(failures, failure_to_ignore)
674
- return if failures_to_ignore.empty?
674
+ return if failures_to_ignore.empty? || failures_to_ignore.size < failures.size
675
675
 
676
676
  "the errors included: #{failures_to_ignore.map { |e| "`#{e}`" }.join(', ')}"
677
677
  end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module GitlabQuality
4
4
  module TestTooling
5
- VERSION = "3.6.1"
5
+ VERSION = "3.8.0"
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gitlab_quality-test_tooling
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.6.1
4
+ version: 3.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - GitLab Quality
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-02-03 00:00:00.000000000 Z
11
+ date: 2026-02-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: climate_control