gitlab-secret_detection 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d7ff1ed2f6fa1d52463144cfacf4a23bd191822d6660e5f4d3c15cefb349b9dd
4
- data.tar.gz: 13af55c3efd41733108968de3d0c8d03648c726947170636a293db174521a81e
3
+ metadata.gz: 648b6d5277ac8e7948762533af39dc2f4ff0ae4c62fbdcc6d5d32615a44ab815
4
+ data.tar.gz: afd2a580cb0a73bb84a401616ec622e1e14d2b75021036a959f933ed75864cf6
5
5
  SHA512:
6
- metadata.gz: 6698802604dffeff97940812c8463dbb21c73698c8f438e825b482d5dc68c97953aaa983d03af7de26f37ff9ec7d1e1418966e495c2ef8f308eb05f40d4ae601
7
- data.tar.gz: b6e1b308dddfa644500184058228faf67458702c82994fc9c475ce03ef47a51323f309c1163287a2d1ce0bbd13aaec2a19c71e4a8ca711f6a6beaefbda2649d1
6
+ metadata.gz: 7c49b71891f6e13d8dc252936f18df838d8e9d42000cbc1f4f3d7fe56b258a7e2c17bce7a7f0e771bc16991fa46bacf641a636a1d1efd3ef23de6b0a31a011b8
7
+ data.tar.gz: af93c82a0025be12bc82fe6ff62b290fe193a2338ae2f98d83bc2729d1c7cc35a223330a3a9907a9f715c009127e93ba2d13fc33982d2f13a179d8548f5cc36e
@@ -4,6 +4,7 @@ require 're2'
4
4
  require 'logger'
5
5
  require 'timeout'
6
6
  require 'English'
7
+ require 'parallel'
7
8
 
8
9
  module GitLab
9
10
  module SecretDetection
@@ -24,6 +25,14 @@ module GitLab
24
25
  DEFAULT_PAYLOAD_TIMEOUT_SECS = 30 # 30 seconds
25
26
  # Tags used for creating default pattern matcher
26
27
  DEFAULT_PATTERN_MATCHER_TAGS = ['gitlab_blocking'].freeze
28
+ # Maximum number of child processes to spawn per request
29
+ # ref: https://gitlab.com/gitlab-org/gitlab/-/issues/430160
30
+ MAX_PROCS_PER_REQUEST = 5
31
+ # Minimum cumulative size of the payloads required to spawn and
32
+ # run the scan within a new subprocess.
33
+ MIN_CHUNK_SIZE_PER_PROC_BYTES = 2_097_152 # 2MiB
34
+ # Whether to run scan in subprocesses or not. Default is false.
35
+ RUN_IN_SUBPROCESS = false
27
36
 
28
37
  # Initializes the instance with logger along with following operations:
29
38
  # 1. Extract keywords from the parsed ruleset to use it for matching keywords before regex operation.
@@ -58,6 +67,13 @@ module GitLab
58
67
  # For example: Add `gitlab_blocking` to include only rules for Push Protection. Defaults to
59
68
  # [`gitlab_blocking`] (+DEFAULT_PATTERN_MATCHER_TAGS+).
60
69
  #
70
+ # NOTE:
71
+ # Running the scan in fork mode primarily focuses on reducing the memory consumption of the scan by
72
+ # offloading regex operations on large payloads to sub-processes. However, it does not guarantee an improvement
73
+ # in the overall latency of the scan, especially for smaller payloads, where the overhead of
74
+ # forking a new process adds to the overall latency of the scan instead. For more details on subprocess-based
75
+ # execution, see https://gitlab.com/gitlab-org/gitlab/-/issues/430160.
76
+ #
61
77
  # Returns an instance of GitLab::SecretDetection::Core::Response by following below structure:
62
78
  # {
63
79
  # status: One of the Core::Status values
@@ -70,7 +86,8 @@ module GitLab
70
86
  payload_timeout: DEFAULT_PAYLOAD_TIMEOUT_SECS,
71
87
  raw_value_exclusions: [],
72
88
  rule_exclusions: [],
73
- tags: DEFAULT_PATTERN_MATCHER_TAGS
89
+ tags: DEFAULT_PATTERN_MATCHER_TAGS,
90
+ subprocess: RUN_IN_SUBPROCESS
74
91
  )
75
92
 
76
93
  return Core::Response.new(Core::Status::INPUT_ERROR) unless validate_scan_input(payloads)
@@ -87,11 +104,13 @@ module GitLab
87
104
 
88
105
  next Core::Response.new(Core::Status::NOT_FOUND) if matched_payloads.empty?
89
106
 
90
- secrets = run_scan(
107
+ scan_args = {
91
108
  payloads: matched_payloads, payload_timeout:,
92
109
  pattern_matcher: build_pattern_matcher(tags:),
93
110
  raw_value_exclusions:, rule_exclusions:
94
- )
111
+ }
112
+
113
+ secrets = subprocess ? run_scan_within_subprocess(**scan_args) : run_scan(**scan_args)
95
114
 
96
115
  scan_status = overall_scan_status(secrets)
97
116
 
@@ -205,6 +224,36 @@ module GitLab
205
224
  end
206
225
  end
207
226
 
227
# Scans the given payloads for secrets inside forked sub-processes.
#
# Payloads are first bucketed with +group_by_chunk_size+ based on the value
# each payload returns from +size+; every bucket is then handed to
# +Parallel.flat_map+, which runs at most +MAX_PROCS_PER_REQUEST+ processes
# at a time and, because of +isolation: true+, does not reuse sub-processes.
#
# NOTE(review): the bucketing assumes +payload.size+ is the payload's size
# in bytes -- confirm against the payload type passed in by the caller.
#
# @param payloads [Array] payloads to scan; each must respond to +id+ and +size+
# @param payload_timeout [Numeric] per-payload timeout handed to +Timeout.timeout+
# @param pattern_matcher matcher forwarded to +find_secrets_in_payload+
# @param raw_value_exclusions [Array] forwarded to +find_secrets_in_payload+
# @param rule_exclusions [Array] forwarded to +find_secrets_in_payload+
# @return [Array] frozen, flattened list of the per-payload scan results
def run_scan_within_subprocess(
  payloads:, payload_timeout:, pattern_matcher:, raw_value_exclusions: [],
  rule_exclusions: []
)
  index_groups = group_by_chunk_size(payloads.map(&:size))

  # Translate every group of indices back into the payload objects themselves.
  payload_groups = index_groups.map do |indices|
    indices.map { |position| payloads[position] }
  end

  # Scans one payload under the timeout; a timeout is logged and reported as a
  # PAYLOAD_TIMEOUT finding for that payload instead of aborting the whole group.
  scan_payload = lambda do |payload|
    Timeout.timeout(payload_timeout) do
      find_secrets_in_payload(
        payload:,
        pattern_matcher:,
        raw_value_exclusions:,
        rule_exclusions:
      )
    end
  rescue Timeout::Error => e
    logger.error "Secret Detection scan timed out on the payload(id:#{payload.id}): #{e}"
    Core::Finding.new(payload.id, Core::Status::PAYLOAD_TIMEOUT)
  end

  findings = Parallel.flat_map(
    payload_groups,
    in_processes: MAX_PROCS_PER_REQUEST,
    isolation: true # do not reuse sub-processes
  ) { |payload_group| payload_group.flat_map(&scan_payload) }

  findings.freeze
end
256
+
208
257
  # Finds secrets in the given payload guarded with a timeout as a circuit breaker. It accepts
209
258
  # literal values to exclude from the input before the scan, also SD rules to exclude during
210
259
  # the scan.
@@ -268,6 +317,35 @@ module GitLab
268
317
  Core::Status::FOUND_WITH_ERRORS
269
318
  end
270
319
  end
320
+
321
# Partitions the indices of +payload_size_arr+ into consecutive groups.
#
# Sizes are accumulated in input order; as soon as the running total of the
# current group reaches +min_chunk_size+, the group is closed and a new one
# is started. Whatever indices remain after the last closed group form one
# final (possibly undersized) group, so every input index appears in exactly
# one group -- including trailing zero-size entries.
#
# @param payload_size_arr [Array<Integer>] payload sizes (in bytes)
# @param min_chunk_size [Integer] minimum cumulative size required to close
#   a group; defaults to +MIN_CHUNK_SIZE_PER_PROC_BYTES+
# @return [Array<Array<Integer>>] groups of indices into +payload_size_arr+
def group_by_chunk_size(payload_size_arr, min_chunk_size: MIN_CHUNK_SIZE_PER_PROC_BYTES)
  chunk_indexes = []
  cumulative_size = 0
  chunk_idx_start = 0

  payload_size_arr.each_with_index do |size, index|
    cumulative_size += size
    next if cumulative_size < min_chunk_size

    chunk_indexes << (chunk_idx_start..index).to_a

    chunk_idx_start = index + 1
    cumulative_size = 0
  end

  # Leftover indices that never reached the threshold still need a group;
  # checking the index (not the accumulated size) keeps zero-size payloads too.
  remaining = (chunk_idx_start...payload_size_arr.length).to_a
  chunk_indexes << remaining unless remaining.empty?

  chunk_indexes
end
271
349
  end
272
350
  end
273
351
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gitlab-secret_detection
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - group::secret detection
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2024-10-04 00:00:00.000000000 Z
13
+ date: 2024-10-07 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: grpc
@@ -40,6 +40,20 @@ dependencies:
40
40
  - - '='
41
41
  - !ruby/object:Gem::Version
42
42
  version: 1.63.0
43
+ - !ruby/object:Gem::Dependency
44
+ name: parallel
45
+ requirement: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - "~>"
48
+ - !ruby/object:Gem::Version
49
+ version: '1.19'
50
+ type: :runtime
51
+ prerelease: false
52
+ version_requirements: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - "~>"
55
+ - !ruby/object:Gem::Version
56
+ version: '1.19'
43
57
  - !ruby/object:Gem::Dependency
44
58
  name: re2
45
59
  requirement: !ruby/object:Gem::Requirement