gitlab_quality-test_tooling 3.6.1 → 3.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6922c9895f727e5c4ad2c7fd9ecc7ffa2c17f0de9125ae24def5627dc040d2f9
4
- data.tar.gz: 89a32f162c38d2098ffad75338adfb3ef3d90b773819ba06b03d03db111cd252
3
+ metadata.gz: 0eb24e5b8b53d2478c3994cd26ca10425b01e0d7d31b51e125fedeb799d2b420
4
+ data.tar.gz: 374324661a08dafc46f7b92cd63136878bbeca121f5c4293bec7b4bb22ef7b1e
5
5
  SHA512:
6
- metadata.gz: 1c216c46fe2a705acba58bee70576cc7e4a71773802bc58ac59f60ef3ca413186f4a3bf5255712986b8943ffe7248797ce1e9d35a078176fa76cad458ea95519
7
- data.tar.gz: 69c7c48e2270d86d89b6c12d5d11800079fe20af2c6024f65b8d2d35f89499a63816cb188a3c086520f571a9288cd51330849b115554904c7e802d9e2d2fff42
6
+ metadata.gz: 5803a5c51bdb252e3623379ef3fe0d791c344463ded48a47e568381a5014282f56d62e328b85cf7746940736130223585ced38a6a68747a47c83ae5fef7265df
7
+ data.tar.gz: 63df90311f5de1f0906f46402299e156c596eea5f78bc466c258cd9bda8aff961b01fb73f415f9d5ea56320ae950759659e98d46fb8cdd1b8d51c71b44d51478
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- gitlab_quality-test_tooling (3.6.1)
4
+ gitlab_quality-test_tooling (3.7.1)
5
5
  activesupport (>= 7.0)
6
6
  amatch (~> 0.4.1)
7
7
  fog-google (~> 1.24, >= 1.24.1)
data/README.md CHANGED
@@ -292,7 +292,7 @@ Options:
292
292
  --clickhouse-username USERNAME
293
293
  ClickHouse username
294
294
  --clickhouse-shared-database DATABASE
295
- ClickHouse shared database name (default: shared)
295
+ ClickHouse shared database name
296
296
  --responsibility-patterns PATH
297
297
  Path to YAML file with responsibility classification patterns
298
298
 
@@ -312,7 +312,7 @@ Usage: exe/sync-category-owners [options]
312
312
  Options:
313
313
  --clickhouse-url URL ClickHouse server URL
314
314
  --clickhouse-database DATABASE
315
- ClickHouse database name (default: shared)
315
+ ClickHouse database name
316
316
  --clickhouse-username USERNAME
317
317
  ClickHouse username
318
318
 
data/exe/sync-category-owners CHANGED
@@ -24,7 +24,7 @@ options = OptionParser.new do |opts|
24
24
  params[:clickhouse_url] = url
25
25
  end
26
26
 
27
- opts.on('--clickhouse-database DATABASE', 'ClickHouse database name (default: shared)') do |database|
27
+ opts.on('--clickhouse-database DATABASE', 'ClickHouse database name') do |database|
28
28
  params[:clickhouse_database] = database
29
29
  end
30
30
 
@@ -51,9 +51,6 @@ options = OptionParser.new do |opts|
51
51
  opts.parse(ARGV)
52
52
  end
53
53
 
54
- # Default database to 'shared' if not specified
55
- params[:clickhouse_database] ||= 'shared'
56
-
57
54
  if params.any? && (required_params - params.keys).none?
58
55
  clickhouse_password = ENV.fetch('GLCI_CLICKHOUSE_METRICS_PASSWORD', nil)
59
56
  if clickhouse_password.to_s.strip.empty?
@@ -88,10 +85,9 @@ if params.any? && (required_params - params.keys).none?
88
85
  password: clickhouse_password
89
86
  )
90
87
 
91
- category_owners_table.truncate
92
88
  category_owners_table.push(category_owners.as_db_table)
93
89
 
94
- puts "Successfully synced #{category_owners.as_db_table.length} feature categories to ClickHouse"
90
+ puts "Successfully synced feature category ownership data to ClickHouse"
95
91
  else
96
92
  puts "Missing argument(s). Required arguments are: #{required_params.map { |p| "--#{p.to_s.tr('_', '-')}" }.join(', ')}"
97
93
  puts options
data/exe/test-coverage CHANGED
@@ -22,7 +22,7 @@ require_relative '../lib/gitlab_quality/test_tooling/code_coverage/responsibilit
22
22
  require_relative '../lib/gitlab_quality/test_tooling/code_coverage/responsibility_patterns_config'
23
23
 
24
24
  params = {}
25
- required_params = [:test_reports, :coverage_report, :test_map, :clickhouse_url, :clickhouse_database, :clickhouse_username, :responsibility_patterns]
25
+ required_params = [:test_reports, :coverage_report, :test_map, :clickhouse_url, :clickhouse_database, :clickhouse_username, :clickhouse_shared_database, :responsibility_patterns]
26
26
 
27
27
  options = OptionParser.new do |opts|
28
28
  opts.banner = "Usage: #{$PROGRAM_NAME} [options]"
@@ -55,7 +55,7 @@ options = OptionParser.new do |opts|
55
55
  params[:clickhouse_username] = username
56
56
  end
57
57
 
58
- opts.on('--clickhouse-shared-database DATABASE', 'ClickHouse shared database name (default: shared)') do |database|
58
+ opts.on('--clickhouse-shared-database DATABASE', 'ClickHouse shared database name') do |database|
59
59
  params[:clickhouse_shared_database] = database
60
60
  end
61
61
 
@@ -91,7 +91,7 @@ if params.any? && (required_params - params.keys).none?
91
91
  exit 1
92
92
  end
93
93
 
94
- [:clickhouse_url, :clickhouse_database, :clickhouse_username].each do |param|
94
+ [:clickhouse_url, :clickhouse_database, :clickhouse_username, :clickhouse_shared_database].each do |param|
95
95
  if params[param].to_s.strip.empty?
96
96
  puts "Error: --#{param.to_s.tr('_', '-')} cannot be empty"
97
97
  exit 1
@@ -172,7 +172,7 @@ if params.any? && (required_params - params.keys).none?
172
172
 
173
173
  shared_clickhouse_data = {
174
174
  url: params[:clickhouse_url],
175
- database: params[:clickhouse_shared_database] || 'shared',
175
+ database: params[:clickhouse_shared_database],
176
176
  username: params[:clickhouse_username],
177
177
  password: clickhouse_password
178
178
  }
@@ -13,12 +13,33 @@ module GitlabQuality
13
13
 
14
14
  KNOWN_UNOWNED = %w[shared not_owned tooling].freeze
15
15
 
16
- def truncate
17
- logger.debug("#{LOG_PREFIX} Truncating table #{full_table_name} ...")
16
+ # SQL query to get the latest ownership record for each unique category+ownership combination
17
+ # Partitions by the full composite key to handle cases where a category has multiple ownerships
18
+ LATEST_RECORDS_QUERY = <<~SQL
19
+ SELECT category, group, stage, section
20
+ FROM (
21
+ SELECT category, group, stage, section,
22
+ ROW_NUMBER() OVER (PARTITION BY category, group, stage, section ORDER BY timestamp DESC) as rn
23
+ FROM %{table_name}
24
+ )
25
+ WHERE rn = 1
26
+ SQL
18
27
 
19
- client.query("TRUNCATE TABLE #{full_table_name}")
28
+ # Insert only new category ownership records that don't already exist
29
+ # This avoids needing TRUNCATE permission
30
+ def push(data)
31
+ return logger.warn("#{LOG_PREFIX} No data found, skipping insert!") if data.empty?
20
32
 
21
- logger.info("#{LOG_PREFIX} Successfully truncated table #{full_table_name}")
33
+ sanitized_data = sanitize_and_filter_data(data)
34
+ return if sanitized_data.empty?
35
+
36
+ new_records = filter_new_records(sanitized_data)
37
+ return if new_records.empty?
38
+
39
+ insert_new_records(new_records, sanitized_data.size)
40
+ rescue StandardError => e
41
+ logger.error("#{LOG_PREFIX} Error occurred while pushing data to #{full_table_name}: #{e.message}")
42
+ raise
22
43
  end
23
44
 
24
45
  # Owners of particular feature category as group, stage and section
@@ -41,14 +62,74 @@ module GitlabQuality
41
62
  private
42
63
 
43
64
  def records
44
- @records ||= client
45
- .query("SELECT category, group, stage, section FROM #{table_name}")
46
- .each_with_object({}) { |record, hsh| hsh[record["category"]] = record.slice("group", "stage", "section") }
65
+ @records ||= fetch_latest_records.each_with_object({}) do |record, hsh|
66
+ hsh[record["category"]] = record.slice("group", "stage", "section")
67
+ end
68
+ end
69
+
70
+ def sanitize_and_filter_data(data)
71
+ logger.debug("#{LOG_PREFIX} Starting data export to ClickHouse")
72
+ sanitized_data = sanitize(data)
73
+
74
+ logger.warn("#{LOG_PREFIX} No valid data found after sanitization, skipping ClickHouse export!") if sanitized_data.empty?
75
+
76
+ sanitized_data
77
+ end
78
+
79
+ def filter_new_records(sanitized_data)
80
+ existing_records = fetch_existing_records
81
+ # Deduplicate against latest records per category to prevent inserting duplicate historical records.
82
+ # This ensures we only insert records with new category+ownership combinations, even if an older
83
+ # version of the same category+ownership existed previously.
84
+ new_records = sanitized_data.reject { |record| existing_records.include?(record_key(record)) }
85
+
86
+ logger.info("#{LOG_PREFIX} No new records to insert, all data already exists") if new_records.empty?
87
+
88
+ new_records
89
+ end
90
+
91
+ def insert_new_records(new_records, total_sanitized_count)
92
+ client.insert_json_data(table_name, new_records)
93
+ new_count = new_records.size
94
+ existing_count = total_sanitized_count - new_count
95
+ record_word = new_count == 1 ? 'record' : 'records'
96
+ logger.info("#{LOG_PREFIX} Inserted #{new_count} new #{record_word} (#{existing_count} already existed)")
97
+ end
98
+
99
+ def fetch_existing_records
100
+ fetch_latest_records.to_set { |record| record_key(record) }
101
+ end
102
+
103
+ def fetch_latest_records
104
+ query = format(LATEST_RECORDS_QUERY, table_name: table_name)
105
+ client.query(query)
106
+ end
107
+
108
+ def sanitized_data_record(record)
109
+ {
110
+ timestamp: time,
111
+ category: record[:feature_category],
112
+ group: record[:group],
113
+ stage: record[:stage],
114
+ section: record[:section]
115
+ }
116
+ end
117
+
118
+ def record_key(record)
119
+ # Create a unique key for the combination of category + ownership
120
+ # Normalize to string keys for consistent access
121
+ normalized = record.transform_keys(&:to_s)
122
+ [
123
+ normalized["category"],
124
+ normalized["group"],
125
+ normalized["stage"],
126
+ normalized["section"]
127
+ ]
47
128
  end
48
129
 
49
130
  # @return [Boolean] True if the record is valid, false otherwise
50
131
  def valid_record?(record)
51
- required_fields = %i[category group stage section]
132
+ required_fields = %i[feature_category group stage section]
52
133
 
53
134
  required_fields.each do |field|
54
135
  if record[field].nil?
@@ -9,6 +9,36 @@ module GitlabQuality
9
9
  class TestFileMappingsTable < GitlabQuality::TestTooling::CodeCoverage::ClickHouse::Table
10
10
  TABLE_NAME = "test_file_mappings"
11
11
 
12
+ # Override push to filter out duplicate mappings before inserting
13
+ # This prevents accumulating 36M duplicate rows per day in ReplacingMergeTree
14
+ #
15
+ # @param data [Array<Hash>] Code coverage related data to be pushed to ClickHouse
16
+ # @return [nil]
17
+ def push(data) # rubocop:disable Metrics/AbcSize
18
+ return logger.warn("#{LOG_PREFIX} No data found, skipping ClickHouse export!") if data.empty?
19
+
20
+ logger.debug("#{LOG_PREFIX} Starting data export to ClickHouse")
21
+ sanitized_data = sanitize(data)
22
+
23
+ return logger.warn("#{LOG_PREFIX} No valid data found after sanitization, skipping ClickHouse export!") if sanitized_data.empty?
24
+
25
+ # Filter out records that already exist with identical values
26
+ new_or_changed_records = filter_duplicates(sanitized_data)
27
+
28
+ if new_or_changed_records.empty?
29
+ logger.info("#{LOG_PREFIX} All #{sanitized_data.size} mappings already exist with same values, skipping insert to #{full_table_name}")
30
+ return
31
+ end
32
+
33
+ client.insert_json_data(table_name, new_or_changed_records)
34
+ skipped_count = sanitized_data.size - new_or_changed_records.size
35
+ logger.info("#{LOG_PREFIX} Successfully pushed #{new_or_changed_records.size} new/changed records " \
36
+ "to #{full_table_name} (skipped #{skipped_count} duplicates)")
37
+ rescue StandardError => e
38
+ logger.error("#{LOG_PREFIX} Error occurred while pushing data to #{full_table_name}: #{e.message}")
39
+ raise
40
+ end
41
+
12
42
  private
13
43
 
14
44
  # @return [Boolean] True if the record is valid, false otherwise
@@ -38,6 +68,9 @@ module GitlabQuality
38
68
  timestamp: time,
39
69
  test_file: record[:test_file],
40
70
  source_file: record[:source_file],
71
+ # CI_PROJECT_PATH is set by GitLab CI runner and considered trusted infrastructure input.
72
+ # In non-CI environments, this may be user-controlled, but sanitize_for_clickhouse
73
+ # provides SQL injection protection.
41
74
  ci_project_path: ENV.fetch('CI_PROJECT_PATH', nil),
42
75
  category: record[:category] || '',
43
76
  group: record[:group] || '',
@@ -45,6 +78,115 @@ module GitlabQuality
45
78
  section: record[:section] || ''
46
79
  }
47
80
  end
81
+
82
+ # Filter out mappings that already exist in ClickHouse with identical enriched column values
83
+ # Only returns records that are:
84
+ # - New mappings (test_file + source_file combination doesn't exist), OR
85
+ # - Changed mappings (enriched columns category/group/stage/section differ from existing)
86
+ #
87
+ # This prevents inserting 36M duplicate rows per day while still allowing updates when
88
+ # ownership (category/group/stage/section) changes over time.
89
+ #
90
+ # @param records [Array<Hash>] Sanitized records to insert
91
+ # @return [Array<Hash>] Records that are new or have changed values
92
+ def filter_duplicates(records) # rubocop:disable Metrics/AbcSize
93
+ return records if records.empty?
94
+
95
+ # Assumes all records belong to the same project (batch from single pipeline run)
96
+ ci_project_path = records.first[:ci_project_path]
97
+ return records if ci_project_path.blank? # Can't query without project path
98
+
99
+ logger.debug("#{LOG_PREFIX} Checking for duplicate mappings in #{full_table_name}")
100
+
101
+ # Build a hash of existing mappings with their current enriched column values
102
+ # Key: "test_file|source_file", Value: {category, group, stage, section}
103
+ existing_mappings = fetch_existing_mappings(ci_project_path, records)
104
+
105
+ # Filter records: keep only new or changed mappings
106
+ new_or_changed = records.select do |record|
107
+ key = "#{record[:test_file]}|#{record[:source_file]}"
108
+ existing = existing_mappings[key]
109
+
110
+ # New mapping - doesn't exist yet
111
+ next true if existing.nil?
112
+
113
+ # Existing mapping - check if enriched columns changed
114
+ existing[:category] != record[:category] ||
115
+ existing[:group] != record[:group] ||
116
+ existing[:stage] != record[:stage] ||
117
+ existing[:section] != record[:section]
118
+ end
119
+
120
+ logger.debug("#{LOG_PREFIX} Found #{new_or_changed.size} new/changed mappings out of #{records.size} total")
121
+ new_or_changed
122
+ rescue StandardError => e
123
+ # If duplicate detection fails, fall back to inserting all records (safer than failing completely)
124
+ logger.warn("#{LOG_PREFIX} Failed to check for duplicates: #{e.message}. Inserting all records.")
125
+ records
126
+ end
127
+
128
+ # Fetch existing mappings from ClickHouse for the given project
129
+ # Returns a hash mapping "test_file|source_file" => {category, group, stage, section}
130
+ #
131
+ # @param ci_project_path [String] The CI project path to filter by
132
+ # @param records [Array<Hash>] Records being inserted (used to limit query scope)
133
+ # @return [Hash] Existing mappings with their enriched column values
134
+ def fetch_existing_mappings(ci_project_path, records) # rubocop:disable Metrics/AbcSize
135
+ # Query for ALL latest mappings for this project
136
+ # We filter in Ruby rather than building dynamic SQL to avoid injection risks
137
+ sql = <<~SQL
138
+ SELECT
139
+ test_file,
140
+ source_file,
141
+ category,
142
+ `group`,
143
+ stage,
144
+ section
145
+ FROM #{full_table_name}
146
+ WHERE ci_project_path = {project_path:String}
147
+ AND (ci_project_path, test_file, source_file, timestamp) IN (
148
+ SELECT
149
+ ci_project_path,
150
+ test_file,
151
+ source_file,
152
+ MAX(timestamp) AS timestamp
153
+ FROM #{full_table_name}
154
+ WHERE ci_project_path = {project_path:String}
155
+ GROUP BY ci_project_path, test_file, source_file
156
+ )
157
+ SQL
158
+
159
+ # Substitute project_path value using manual escaping (client doesn't support parameterized queries)
160
+ sql_with_params = sql.strip.gsub('{project_path:String}', "'#{sanitize_for_clickhouse(ci_project_path)}'")
161
+ results = client.query(sql_with_params)
162
+
163
+ # Build lookup hash, filtering to only the test_files we're checking if dataset is small
164
+ test_files_set = records.to_set { |r| r[:test_file] } if records.size <= 10_000
165
+
166
+ results.each_with_object({}) do |row, hash|
167
+ # Skip if we're doing selective filtering and this test_file isn't in our set
168
+ next if test_files_set && !test_files_set.include?(row['test_file'])
169
+
170
+ key = "#{row['test_file']}|#{row['source_file']}"
171
+ hash[key] = {
172
+ category: row['category'],
173
+ group: row['group'],
174
+ stage: row['stage'],
175
+ section: row['section']
176
+ }
177
+ end
178
+ rescue StandardError => e
179
+ logger.warn("#{LOG_PREFIX} Failed to fetch existing mappings: #{e.message}")
180
+ {} # Return empty hash on error (will treat all records as new)
181
+ end
182
+
183
+ # Sanitize string for ClickHouse by escaping single quotes and backslashes
184
+ # This protects against SQL injection when we must use string interpolation
185
+ # @param str [String] String to sanitize
186
+ # @return [String] Sanitized string
187
+ def sanitize_for_clickhouse(str)
188
+ str.to_s.gsub(/\\/, '\\\\\\\\').gsub("'", "''") # rubocop:disable Style/RedundantRegexpArgument
189
+ end
48
190
  end
49
191
  end
50
192
  end
@@ -671,7 +671,7 @@ module GitlabQuality
671
671
  # @return [String] the reason to ignore the failures, or `nil` if any failures should not be ignored.
672
672
  def ignored_failure_reason(failures, failure_to_ignore)
673
673
  failures_to_ignore = compute_ignored_failures(failures, failure_to_ignore)
674
- return if failures_to_ignore.empty?
674
+ return if failures_to_ignore.empty? || failures_to_ignore.size < failures.size
675
675
 
676
676
  "the errors included: #{failures_to_ignore.map { |e| "`#{e}`" }.join(', ')}"
677
677
  end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module GitlabQuality
4
4
  module TestTooling
5
- VERSION = "3.6.1"
5
+ VERSION = "3.7.1"
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gitlab_quality-test_tooling
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.6.1
4
+ version: 3.7.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - GitLab Quality
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-02-03 00:00:00.000000000 Z
11
+ date: 2026-02-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: climate_control