gitlab_quality-test_tooling 3.7.0 → 3.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3cb2e6580b8b15e78116b52ffd4fed91e0ab7db14ce914eb90f34ace0ca84741
4
- data.tar.gz: 14c55d5e6048c27a891dded80da8d23c3c3b71df15b04c8ff1c98404643bb19e
3
+ metadata.gz: 0eb24e5b8b53d2478c3994cd26ca10425b01e0d7d31b51e125fedeb799d2b420
4
+ data.tar.gz: 374324661a08dafc46f7b92cd63136878bbeca121f5c4293bec7b4bb22ef7b1e
5
5
  SHA512:
6
- metadata.gz: 607a792c6df20d566ab3ec3dba96b80ed8ae8c08c6c6fe30ab9cbc39ccc2201c0934b86641ff137a2b9726d2b3386ef0ed22cb76f160faeacd74aef91c4007a8
7
- data.tar.gz: 39d357544ea50bbc8968a224b699110f6957774140e7c8074fa9f28a5cd1cf03b1cdb6f0f0e22fd9a2b97ad06b0d54d183d06db043ff52a048a30758e928294f
6
+ metadata.gz: 5803a5c51bdb252e3623379ef3fe0d791c344463ded48a47e568381a5014282f56d62e328b85cf7746940736130223585ced38a6a68747a47c83ae5fef7265df
7
+ data.tar.gz: 63df90311f5de1f0906f46402299e156c596eea5f78bc466c258cd9bda8aff961b01fb73f415f9d5ea56320ae950759659e98d46fb8cdd1b8d51c71b44d51478
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- gitlab_quality-test_tooling (3.7.0)
4
+ gitlab_quality-test_tooling (3.7.1)
5
5
  activesupport (>= 7.0)
6
6
  amatch (~> 0.4.1)
7
7
  fog-google (~> 1.24, >= 1.24.1)
@@ -13,13 +13,13 @@ module GitlabQuality
13
13
 
14
14
  KNOWN_UNOWNED = %w[shared not_owned tooling].freeze
15
15
 
16
- # SQL query to get the latest ownership record for each category
17
- # Uses window function to avoid loading entire table history
16
+ # SQL query to get the latest ownership record for each unique category+ownership combination
17
+ # Partitions by the full composite key to handle cases where a category has multiple ownerships
18
18
  LATEST_RECORDS_QUERY = <<~SQL
19
19
  SELECT category, group, stage, section
20
20
  FROM (
21
21
  SELECT category, group, stage, section,
22
- ROW_NUMBER() OVER (PARTITION BY category ORDER BY timestamp DESC) as rn
22
+ ROW_NUMBER() OVER (PARTITION BY category, group, stage, section ORDER BY timestamp DESC) as rn
23
23
  FROM %{table_name}
24
24
  )
25
25
  WHERE rn = 1
@@ -9,6 +9,36 @@ module GitlabQuality
9
9
  class TestFileMappingsTable < GitlabQuality::TestTooling::CodeCoverage::ClickHouse::Table
10
10
  TABLE_NAME = "test_file_mappings"
11
11
 
12
+ # Override push to filter out duplicate mappings before inserting
13
+ # This prevents accumulating 36M duplicate rows per day in ReplacingMergeTree
14
+ #
15
+ # @param data [Array<Hash>] Code coverage related data to be pushed to ClickHouse
16
+ # @return [nil]
17
def push(data) # rubocop:disable Metrics/AbcSize
  # Exports coverage mappings to ClickHouse, inserting only the rows that
  # filter_duplicates reports as new or changed. Any error is logged and re-raised.
  return logger.warn("#{LOG_PREFIX} No data found, skipping ClickHouse export!") if data.empty?

  logger.debug("#{LOG_PREFIX} Starting data export to ClickHouse")
  cleaned = sanitize(data)
  return logger.warn("#{LOG_PREFIX} No valid data found after sanitization, skipping ClickHouse export!") if cleaned.empty?

  # Drop rows that are already stored with identical values.
  to_insert = filter_duplicates(cleaned)

  if to_insert.empty?
    logger.info("#{LOG_PREFIX} All #{cleaned.size} mappings already exist with same values, skipping insert to #{full_table_name}")
    nil
  else
    client.insert_json_data(table_name, to_insert)
    duplicates = cleaned.size - to_insert.size
    logger.info("#{LOG_PREFIX} Successfully pushed #{to_insert.size} new/changed records to #{full_table_name} (skipped #{duplicates} duplicates)")
  end
rescue StandardError => e
  logger.error("#{LOG_PREFIX} Error occurred while pushing data to #{full_table_name}: #{e.message}")
  raise
end
41
+
12
42
  private
13
43
 
14
44
  # @return [Boolean] True if the record is valid, false otherwise
@@ -38,6 +68,9 @@ module GitlabQuality
38
68
  timestamp: time,
39
69
  test_file: record[:test_file],
40
70
  source_file: record[:source_file],
71
+ # CI_PROJECT_PATH is set by GitLab CI runner and considered trusted infrastructure input.
72
+ # In non-CI environments, this may be user-controlled, but sanitize_for_clickhouse
73
+ # provides SQL injection protection.
41
74
  ci_project_path: ENV.fetch('CI_PROJECT_PATH', nil),
42
75
  category: record[:category] || '',
43
76
  group: record[:group] || '',
@@ -45,6 +78,115 @@ module GitlabQuality
45
78
  section: record[:section] || ''
46
79
  }
47
80
  end
81
+
82
+ # Filter out mappings that already exist in ClickHouse with identical enriched column values
83
+ # Only returns records that are:
84
+ # - New mappings (test_file + source_file combination doesn't exist), OR
85
+ # - Changed mappings (enriched columns category/group/stage/section differ from existing)
86
+ #
87
+ # This prevents inserting 36M duplicate rows per day while still allowing updates when
88
+ # ownership (category/group/stage/section) changes over time.
89
+ #
90
+ # @param records [Array<Hash>] Sanitized records to insert
91
+ # @return [Array<Hash>] Records that are new or have changed values
92
def filter_duplicates(records) # rubocop:disable Metrics/AbcSize
  # Keeps only records whose test_file/source_file pair is not yet stored, or whose
  # category/group/stage/section differ from the stored values.
  # If the check itself fails, every record is returned so the insert still happens.
  return records if records.empty?

  # The batch is assumed to come from a single project, so the first record's path is used.
  project = records.first[:ci_project_path]
  return records if project.blank? # nothing to look up without a project path

  logger.debug("#{LOG_PREFIX} Checking for duplicate mappings in #{full_table_name}")

  # Lookup keyed by "test_file|source_file" holding the stored enriched column values.
  stored = fetch_existing_mappings(project, records)
  enriched_columns = %i[category group stage section]

  pending = records.reject do |candidate|
    known = stored["#{candidate[:test_file]}|#{candidate[:source_file]}"]
    # A record is a duplicate only when it is already stored with identical enriched values.
    !known.nil? && known.values_at(*enriched_columns) == candidate.values_at(*enriched_columns)
  end

  logger.debug("#{LOG_PREFIX} Found #{pending.size} new/changed mappings out of #{records.size} total")
  pending
rescue StandardError => e
  # Inserting everything is preferred over failing the whole export.
  logger.warn("#{LOG_PREFIX} Failed to check for duplicates: #{e.message}. Inserting all records.")
  records
end
127
+
128
+ # Fetch existing mappings from ClickHouse for the given project
129
+ # Returns a hash mapping "test_file|source_file" => {category, group, stage, section}
130
+ #
131
+ # @param ci_project_path [String] The CI project path to filter by
132
+ # @param records [Array<Hash>] Records being inserted (used to limit query scope)
133
+ # @return [Hash] Existing mappings with their enriched column values
134
def fetch_existing_mappings(ci_project_path, records) # rubocop:disable Metrics/AbcSize
  # Loads the latest stored mapping per (test_file, source_file) for the given project and
  # returns a hash of "test_file|source_file" => {category:, group:, stage:, section:}.
  # On any failure an empty hash is returned, so callers treat every record as new.
  #
  # The query always covers the whole project; narrowing to the batch happens in Ruby below
  # so that no per-record values are interpolated into the SQL.
  sql = <<~SQL
    SELECT
      test_file,
      source_file,
      category,
      `group`,
      stage,
      section
    FROM #{full_table_name}
    WHERE ci_project_path = {project_path:String}
      AND (ci_project_path, test_file, source_file, timestamp) IN (
        SELECT
          ci_project_path,
          test_file,
          source_file,
          MAX(timestamp) AS timestamp
        FROM #{full_table_name}
        WHERE ci_project_path = {project_path:String}
        GROUP BY ci_project_path, test_file, source_file
      )
  SQL

  # The client has no parameterized queries, so the escaped value is spliced in manually.
  # The block form of gsub is required here: a replacement *string* is re-scanned for
  # backslash sequences (\\, \', \0, \1 ...), which would collapse the doubled backslashes
  # produced by sanitize_for_clickhouse and undo the escaping. A block's result is inserted verbatim.
  quoted_project_path = "'#{sanitize_for_clickhouse(ci_project_path)}'"
  sql_with_params = sql.strip.gsub('{project_path:String}') { quoted_project_path }
  results = client.query(sql_with_params)

  # For small batches, keep only rows whose test_file appears in the batch; larger batches
  # keep every row returned for the project.
  test_files_set = records.to_set { |r| r[:test_file] } if records.size <= 10_000

  results.each_with_object({}) do |row, hash|
    next if test_files_set && !test_files_set.include?(row['test_file'])

    key = "#{row['test_file']}|#{row['source_file']}"
    hash[key] = {
      category: row['category'],
      group: row['group'],
      stage: row['stage'],
      section: row['section']
    }
  end
rescue StandardError => e
  logger.warn("#{LOG_PREFIX} Failed to fetch existing mappings: #{e.message}")
  {} # Return empty hash on error (will treat all records as new)
end
182
+
183
+ # Sanitize string for ClickHouse by escaping single quotes and backslashes
184
+ # This protects against SQL injection when we must use string interpolation
185
+ # @param str [String] String to sanitize
186
+ # @return [String] Sanitized string
187
def sanitize_for_clickhouse(str)
  # Doubles every backslash and every single quote so the value can be placed inside a
  # single-quoted ClickHouse string literal. Input is coerced with to_s first.
  # A hash replacement is used, so the substituted text is inserted as-is.
  escapes = { '\\' => '\\\\', "'" => "''" }
  str.to_s.gsub(/[\\']/, escapes)
end
48
190
  end
49
191
  end
50
192
  end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module GitlabQuality
4
4
  module TestTooling
5
- VERSION = "3.7.0"
5
+ VERSION = "3.7.1"
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gitlab_quality-test_tooling
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.7.0
4
+ version: 3.7.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - GitLab Quality
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-02-04 00:00:00.000000000 Z
11
+ date: 2026-02-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: climate_control