gitlab-secret_detection 0.5.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d7ff1ed2f6fa1d52463144cfacf4a23bd191822d6660e5f4d3c15cefb349b9dd
4
- data.tar.gz: 13af55c3efd41733108968de3d0c8d03648c726947170636a293db174521a81e
3
+ metadata.gz: 648b6d5277ac8e7948762533af39dc2f4ff0ae4c62fbdcc6d5d32615a44ab815
4
+ data.tar.gz: afd2a580cb0a73bb84a401616ec622e1e14d2b75021036a959f933ed75864cf6
5
5
  SHA512:
6
- metadata.gz: 6698802604dffeff97940812c8463dbb21c73698c8f438e825b482d5dc68c97953aaa983d03af7de26f37ff9ec7d1e1418966e495c2ef8f308eb05f40d4ae601
7
- data.tar.gz: b6e1b308dddfa644500184058228faf67458702c82994fc9c475ce03ef47a51323f309c1163287a2d1ce0bbd13aaec2a19c71e4a8ca711f6a6beaefbda2649d1
6
+ metadata.gz: 7c49b71891f6e13d8dc252936f18df838d8e9d42000cbc1f4f3d7fe56b258a7e2c17bce7a7f0e771bc16991fa46bacf641a636a1d1efd3ef23de6b0a31a011b8
7
+ data.tar.gz: af93c82a0025be12bc82fe6ff62b290fe193a2338ae2f98d83bc2729d1c7cc35a223330a3a9907a9f715c009127e93ba2d13fc33982d2f13a179d8548f5cc36e
@@ -4,6 +4,7 @@ require 're2'
4
4
  require 'logger'
5
5
  require 'timeout'
6
6
  require 'English'
7
+ require 'parallel'
7
8
 
8
9
  module GitLab
9
10
  module SecretDetection
@@ -24,6 +25,14 @@ module GitLab
24
25
  DEFAULT_PAYLOAD_TIMEOUT_SECS = 30 # 30 seconds
25
26
  # Tags used for creating default pattern matcher
26
27
  DEFAULT_PATTERN_MATCHER_TAGS = ['gitlab_blocking'].freeze
28
+ # Maximum number of child processes to spawn per request
29
+ # ref: https://gitlab.com/gitlab-org/gitlab/-/issues/430160
30
+ MAX_PROCS_PER_REQUEST = 5
31
+ # Minimum cumulative size of the payloads required to spawn and
32
+ # run the scan within a new subprocess.
33
+ MIN_CHUNK_SIZE_PER_PROC_BYTES = 2_097_152 # 2MiB
34
+ # Whether to run scan in subprocesses or not. Default is false.
35
+ RUN_IN_SUBPROCESS = false
27
36
 
28
37
  # Initializes the instance with logger along with following operations:
29
38
  # 1. Extract keywords from the parsed ruleset to use it for matching keywords before regex operation.
@@ -58,6 +67,13 @@ module GitLab
58
67
  # For example: Add `gitlab_blocking` to include only rules for Push Protection. Defaults to
59
68
  # [`gitlab_blocking`] (+DEFAULT_PATTERN_MATCHER_TAGS+).
60
69
  #
70
+ # NOTE:
71
+ # Running the scan in fork mode primarily focuses on reducing the memory consumption of the scan by
72
+ # offloading regex operations on large payloads to sub-processes. However, it does not assure the improvement
73
+ # in the overall latency of the scan, specifically in the case of smaller payloads, where the overhead of
74
+ # forking a new process adds to the overall latency of the scan instead. For more details on subprocess-based
75
+ # execution, see https://gitlab.com/gitlab-org/gitlab/-/issues/430160.
76
+ #
61
77
  # Returns an instance of GitLab::SecretDetection::Core::Response by following below structure:
62
78
  # {
63
79
  # status: One of the Core::Status values
@@ -70,7 +86,8 @@ module GitLab
70
86
  payload_timeout: DEFAULT_PAYLOAD_TIMEOUT_SECS,
71
87
  raw_value_exclusions: [],
72
88
  rule_exclusions: [],
73
- tags: DEFAULT_PATTERN_MATCHER_TAGS
89
+ tags: DEFAULT_PATTERN_MATCHER_TAGS,
90
+ subprocess: RUN_IN_SUBPROCESS
74
91
  )
75
92
 
76
93
  return Core::Response.new(Core::Status::INPUT_ERROR) unless validate_scan_input(payloads)
@@ -87,11 +104,13 @@ module GitLab
87
104
 
88
105
  next Core::Response.new(Core::Status::NOT_FOUND) if matched_payloads.empty?
89
106
 
90
- secrets = run_scan(
107
+ scan_args = {
91
108
  payloads: matched_payloads, payload_timeout:,
92
109
  pattern_matcher: build_pattern_matcher(tags:),
93
110
  raw_value_exclusions:, rule_exclusions:
94
- )
111
+ }
112
+
113
+ secrets = subprocess ? run_scan_within_subprocess(**scan_args) : run_scan(**scan_args)
95
114
 
96
115
  scan_status = overall_scan_status(secrets)
97
116
 
@@ -205,6 +224,36 @@ module GitLab
205
224
  end
206
225
  end
207
226
 
227
# Variant of the scan that executes the matching work inside child processes.
#
# Payload indices are first bucketed by +group_by_chunk_size+ (fed with each payload's +size+),
# and every bucket of payloads is handed to +Parallel.flat_map+, which is configured with
# +in_processes: MAX_PROCS_PER_REQUEST+ and +isolation: true+ (sub-processes are not reused).
# Each payload is scanned through +find_secrets_in_payload+ under a +payload_timeout+ timeout;
# when the timeout fires, the error is logged and a +Core::Finding+ carrying
# +Core::Status::PAYLOAD_TIMEOUT+ stands in for that payload's result.
#
# Returns the flattened, frozen collection of results gathered from all buckets.
def run_scan_within_subprocess(
  payloads:, payload_timeout:, pattern_matcher:, raw_value_exclusions: [],
  rule_exclusions: [])
  index_buckets = group_by_chunk_size(payloads.map(&:size))
  payload_buckets = index_buckets.map do |indices|
    indices.map { |position| payloads[position] }
  end

  # Scans a single payload, converting a timeout into a PAYLOAD_TIMEOUT finding.
  scan_with_timeout = lambda do |payload|
    Timeout.timeout(payload_timeout) do
      find_secrets_in_payload(payload:, pattern_matcher:, raw_value_exclusions:, rule_exclusions:)
    end
  rescue Timeout::Error => e
    logger.error "Secret Detection scan timed out on the payload(id:#{payload.id}): #{e}"
    Core::Finding.new(payload.id, Core::Status::PAYLOAD_TIMEOUT)
  end

  results = Parallel.flat_map(payload_buckets, in_processes: MAX_PROCS_PER_REQUEST, isolation: true) do |bucket|
    bucket.flat_map(&scan_with_timeout)
  end

  results.freeze
end
256
+
208
257
  # Finds secrets in the given payload guarded with a timeout as a circuit breaker. It accepts
209
258
  # literal values to exclude from the input before the scan, also SD rules to exclude during
210
259
  # the scan.
@@ -268,6 +317,35 @@ module GitLab
268
317
  Core::Status::FOUND_WITH_ERRORS
269
318
  end
270
319
  end
320
+
321
# Partitions payload indices into consecutive groups for subprocess execution.
#
# Accepts an array of payload sizes (in bytes) and returns an array of arrays, where each
# inner array holds the consecutive indices of the input whose cumulative size reaches at
# least +MIN_CHUNK_SIZE_PER_PROC_BYTES+. Indices left over after the last full group are
# returned as one final (smaller) group, so every input index appears in exactly one group.
#
# payload_size_arr - Array of Integer payload sizes.
#
# Returns an Array of Arrays of Integer indices; empty when the input is empty.
def group_by_chunk_size(payload_size_arr)
  chunk_indexes = []
  chunk_idx_start = 0
  cumulative_size = 0

  payload_size_arr.each_with_index do |size, index|
    cumulative_size += size
    next if cumulative_size < MIN_CHUNK_SIZE_PER_PROC_BYTES

    # Threshold reached: close the current group and start a new one after this index.
    chunk_indexes << (chunk_idx_start..index).to_a
    chunk_idx_start = index + 1
    cumulative_size = 0
  end

  # Keep the remainder even when its cumulative size is zero (e.g. trailing empty payloads);
  # otherwise those indices would belong to no group and would never be handed to a subprocess.
  remaining = (chunk_idx_start...payload_size_arr.length).to_a
  chunk_indexes << remaining unless remaining.empty?

  chunk_indexes
end
271
349
  end
272
350
  end
273
351
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gitlab-secret_detection
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - group::secret detection
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2024-10-04 00:00:00.000000000 Z
13
+ date: 2024-10-07 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: grpc
@@ -40,6 +40,20 @@ dependencies:
40
40
  - - '='
41
41
  - !ruby/object:Gem::Version
42
42
  version: 1.63.0
43
+ - !ruby/object:Gem::Dependency
44
+ name: parallel
45
+ requirement: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - "~>"
48
+ - !ruby/object:Gem::Version
49
+ version: '1.19'
50
+ type: :runtime
51
+ prerelease: false
52
+ version_requirements: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - "~>"
55
+ - !ruby/object:Gem::Version
56
+ version: '1.19'
43
57
  - !ruby/object:Gem::Dependency
44
58
  name: re2
45
59
  requirement: !ruby/object:Gem::Requirement