RubyGems - gitlab-secret_detection - Versions diffs - 0.11.1 → 0.39.2 - Mend

gitlab-secret_detection 0.11.1 → 0.39.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

checksums.yaml +4 -4
data/README.md +34 -26
data/lib/gitlab/secret_detection/core/response.rb +16 -6
data/lib/gitlab/secret_detection/core/ruleset.rb +30 -3
data/lib/gitlab/secret_detection/core/scanner.rb +308 -77
data/lib/gitlab/secret_detection/core/secret_push_protection_rules.toml +1072 -0
data/lib/gitlab/secret_detection/core/status.rb +34 -0
data/lib/gitlab/secret_detection/grpc/client/grpc_client.rb +50 -19
data/lib/gitlab/secret_detection/grpc/generated/secret_detection_pb.rb +1 -1
data/lib/gitlab/secret_detection/grpc/integrated_error_tracking.rb +64 -0
data/lib/gitlab/secret_detection/grpc/scanner_service.rb +35 -16
data/lib/gitlab/secret_detection/grpc.rb +1 -0
data/lib/gitlab/secret_detection/utils/masker.rb +43 -0
data/lib/gitlab/secret_detection/utils.rb +1 -0
data/lib/gitlab/secret_detection/version.rb +3 -17
data/proto/secret_detection.proto +3 -0
metadata +209 -19
data/lib/gitlab.rb +0 -6

data/lib/gitlab/secret_detection/core/scanner.rb CHANGED Viewed

@@ -11,14 +11,6 @@ module Gitlab
     module Core
       # Scan is responsible for running Secret Detection scan operation
       class Scanner
-        # RulesetParseError is thrown when the code fails to parse the
-        # ruleset file from the given path
-        RulesetParseError = Class.new(StandardError)
-        # RulesetCompilationError is thrown when the code fails to compile
-        # the predefined rulesets
-        RulesetCompilationError = Class.new(StandardError)
         # default time limit(in seconds) for running the scan operation per invocation
         DEFAULT_SCAN_TIMEOUT_SECS = 180 # 3 minutes
         # default time limit(in seconds) for running the scan operation on a single payload
@@ -32,7 +24,9 @@ module Gitlab
         # run the scan within a new subprocess.
         MIN_CHUNK_SIZE_PER_PROC_BYTES = 2_097_152 # 2MiB
         # Whether to run scan in subprocesses or not. Default is false.
-        RUN_IN_SUBPROCESS = false
+        RUN_IN_SUBPROCESS = ENV.fetch('GITLAB_SD_RUN_IN_SUBPROCESS', false)
+        # Default limit for max findings to be returned in the scan
+        DEFAULT_MAX_FINDINGS_LIMIT = 999
         # Initializes the instance with logger along with following operations:
         # 1. Extract keywords from the parsed ruleset to use it for matching keywords before regex operation.
@@ -46,7 +40,7 @@ module Gitlab
             tags: DEFAULT_PATTERN_MATCHER_TAGS,
             include_missing_tags: false
           )
-          @default_pattern_matcher = build_pattern_matcher(
+          @default_pattern_matcher, @default_rules = build_pattern_matcher(
             tags: DEFAULT_PATTERN_MATCHER_TAGS,
             include_missing_tags: false
           ) # includes only gitlab_blocking rules
@@ -59,13 +53,18 @@ module Gitlab
         # +timeout+:: No of seconds(accepts floating point for smaller time values) to limit the total scan duration
         # +payload_timeout+:: No of seconds(accepts floating point for smaller time values) to limit
         #                  the scan duration on each payload
-        # +raw_value_exclusions:+:: Array of raw values to exclude from the scan.
-        # +rule_exclusions+:: Array of rules to exclude from the ruleset used for the scan. Each rule is represented
-        #           by its ID. For example: `gitlab_personal_access_token` for representing Gitlab Personal Access
-        #           Token. By default, no rule is excluded from the ruleset.
+        # +exclusions+:: Hash with keys: :raw_value, :rule and values of arrays of either
+        #           GRPC::Exclusion objects (when used as a standalone service)
+        #           or Security::ProjectSecurityExclusion objects (when used as gem).
+        #           :raw_value - Exclusions in the :raw_value array are the raw values to ignore.
+        #           :rule - Exclusions in the :rule array are the rules to exclude from the ruleset used for the scan.
+        #           Each rule is represented by its ID. For example: `gitlab_personal_access_token`
+        #           for representing Gitlab Personal Access Token. By default, no rule is excluded from the ruleset.
         # +tags+:: Array of tag values to filter from the default ruleset when determining the rules used for the scan.
         #           For example: Add `gitlab_blocking` to include only rules for Push Protection. Defaults to
         #           [`gitlab_blocking`] (+DEFAULT_PATTERN_MATCHER_TAGS+).
+        # +max_findings_limit+:: Integer to limit the number of findings to be returned in the scan. Defaults
+        #           to 999 (+DEFAULT_MAX_FINDINGS_LIMIT+).
         #
         # NOTE:
         # Running the scan in fork mode primarily focuses on reducing the memory consumption of the scan by
@@ -84,13 +83,12 @@ module Gitlab
           payloads,
           timeout: DEFAULT_SCAN_TIMEOUT_SECS,
           payload_timeout: DEFAULT_PAYLOAD_TIMEOUT_SECS,
-          raw_value_exclusions: [],
-          rule_exclusions: [],
+          exclusions: {},
           tags: DEFAULT_PATTERN_MATCHER_TAGS,
-          subprocess: RUN_IN_SUBPROCESS
+          subprocess: RUN_IN_SUBPROCESS,
+          max_findings_limit: DEFAULT_MAX_FINDINGS_LIMIT
         )
-          return Core::Response.new(Core::Status::INPUT_ERROR) unless validate_scan_input(payloads)
+          return Core::Response.new(status: Core::Status::INPUT_ERROR) unless validate_scan_input(payloads)
           # assign defaults since grpc passing zero timeout value to `Timeout.timeout(..)` makes it effectively useless.
           timeout = DEFAULT_SCAN_TIMEOUT_SECS unless timeout.positive?
@@ -102,29 +100,53 @@ module Gitlab
             matched_payloads = filter_by_keywords(keyword_matcher, payloads)
-            next Core::Response.new(Core::Status::NOT_FOUND) if matched_payloads.empty?
+            next Core::Response.new(status: Core::Status::NOT_FOUND) if matched_payloads.empty?
-            scan_args = {
-              payloads: matched_payloads, payload_timeout:,
-              pattern_matcher: build_pattern_matcher(tags:),
-              raw_value_exclusions:, rule_exclusions:
-            }
+            # the pattern matcher will filter rules by tags so we use the filtered rule list
+            pattern_matcher, active_rules = build_pattern_matcher(tags:)
-            secrets = subprocess ? run_scan_within_subprocess(**scan_args) : run_scan(**scan_args)
+            scan_args = {
+              payloads: matched_payloads,
+              payload_timeout:,
+              pattern_matcher:,
+              exclusions:,
+              rules: active_rules,
+              max_findings_limit:
+            }.freeze
+            logger.info(
+              message: "Scan input parameters for running Secret Detection scan",
+              timeout:,
+              payload_timeout:,
+              given_total_payloads: payloads.length,
+              scannable_payloads_post_keyword_filter: matched_payloads.length,
+              tags:,
+              run_in_subprocess: subprocess,
+              max_findings_limit:,
+              given_exclusions: format_exclusions_hash(exclusions)
+            )
+            secrets, applied_exclusions = subprocess ? run_scan_within_subprocess(**scan_args) : run_scan(**scan_args)
             scan_status = overall_scan_status(secrets)
-            Core::Response.new(scan_status, secrets)
+            logger.info(
+              message: "Secret Detection scan completed with #{secrets.length} secrets detected in the given payloads",
+              detected_secrets_metadata: format_detected_secrets_metadata(secrets),
+              applied_exclusions: format_exclusions_arr(applied_exclusions)
+            )
+            Core::Response.new(status: scan_status, results: secrets, applied_exclusions:)
           end
         rescue Timeout::Error => e
           logger.error "Secret detection operation timed out: #{e}"
-          Core::Response.new(Core::Status::SCAN_TIMEOUT)
+          Core::Response.new(status: Core::Status::SCAN_TIMEOUT)
         end
         private
-        attr_reader :logger, :rules, :keywords, :default_pattern_matcher, :default_keyword_matcher
+        attr_reader :logger, :rules, :keywords, :default_pattern_matcher, :default_keyword_matcher, :default_rules
         # Builds RE2::Set pattern matcher for the given combination of rules
         # and tags. It also allows a choice(via `include_missing_tags`) to consider rules
@@ -132,31 +154,49 @@ module Gitlab
         # are same as +DEFAULT_PATTERN_MATCHER_TAGS+ then returns the eagerly loaded default
         # pattern matcher created during initialization.
         def build_pattern_matcher(tags:, include_missing_tags: false)
-          return default_pattern_matcher if tags.eql?(DEFAULT_PATTERN_MATCHER_TAGS) && !default_pattern_matcher.nil?
+          if tags.eql?(DEFAULT_PATTERN_MATCHER_TAGS) && !default_pattern_matcher.nil?
+            logger.info(
+              message: "Given tags input matches default matcher tags, using pre-defined RE2 Pattern Matcher"
+            )
+            return [default_pattern_matcher, default_rules]
+          end
+          logger.info(
+            message: "Creating a new RE2 Pattern Matcher with given tags",
+            tags:,
+            include_missing_tags:
+          )
+          active_rules = []
           matcher = RE2::Set.new
-          rules.each do |rule|
-            rule_tags = rule[:tags]
+          begin
+            rules.each do |rule|
+              rule_tags = rule[:tags]
-            include_rule = if tags.empty?
-                             true
-                           elsif rule_tags
-                             tags.intersect?(rule_tags)
-                           else
-                             include_missing_tags
-                           end
+              include_rule = if tags.empty?
+                               true
+                             elsif rule_tags
+                               tags.intersect?(rule_tags)
+                             else
+                               include_missing_tags
+                             end
-            matcher.add(rule[:regex]) if include_rule
+              active_rules << rule if include_rule
+              matcher.add(rule[:regex]) if include_rule
+            end
+          rescue StandardError => e
+            logger.error "Failed to add regex secret detection ruleset in RE::Set: #{e.message}"
+            raise Core::Ruleset::RulesetCompilationError, cause: e
           end
           unless matcher.compile
-            logger.error "Failed to compile secret detection rulesets in RE::Set"
+            logger.error "Failed to compile secret detection ruleset in RE::Set"
-            raise RulesetCompilationError
+            raise Core::Ruleset::RulesetCompilationError
           end
-          matcher
+          [matcher, active_rules]
         end
         # Creates and returns the unique set of rule matching keywords
@@ -171,7 +211,18 @@ module Gitlab
         end
         def build_keyword_matcher(tags:, include_missing_tags: false)
-          return default_keyword_matcher if tags.eql?(DEFAULT_PATTERN_MATCHER_TAGS) && !default_keyword_matcher.nil?
+          if tags.eql?(DEFAULT_PATTERN_MATCHER_TAGS) && !default_keyword_matcher.nil?
+            logger.info(
+              message: "Given tags input matches default tags, using pre-defined RE2 Keyword Matcher"
+            )
+            return default_keyword_matcher
+          end
+          logger.info(
+            message: "Creating a new RE2 Keyword Matcher..",
+            tags:,
+            include_missing_tags:
+          )
           include_keywords = Set.new
@@ -184,15 +235,28 @@ module Gitlab
             include_keywords.merge(rule[:keywords]) unless rule[:keywords].nil?
           end
-          return nil if include_keywords.empty?
+          if include_keywords.empty?
+            logger.error(
+              message: "No rule keywords found a match with given rule tags, returning empty RE2 Keyword Matcher"
+            )
+            return nil
+          end
-          keywords_regex = include_keywords.join('|')
+          keywords_regex = include_keywords.map { |keyword| RE2::Regexp.quote(keyword) }.join('|')
-          RE2("\\b(#{keywords_regex})")
+          logger.debug(
+            message: "Creating RE2 Keyword Matcher with set of rule keywords",
+            keywords: include_keywords.to_a
+          )
+          RE2("(#{keywords_regex})")
         end
         def filter_by_keywords(keyword_matcher, payloads)
-          return [] if keyword_matcher.nil?
+          if keyword_matcher.nil?
+            logger.warn "No RE2 Keyword Matcher instance available, skipping payload filter by rule keywords step.."
+            return payloads
+          end
           matched_payloads = []
           payloads.each do |payload|
@@ -201,73 +265,142 @@ module Gitlab
             matched_payloads << payload
           end
-          matched_payloads.freeze
+          total_payloads_retained = matched_payloads.length == payloads.length ? 'all' : matched_payloads.length
+          log_message = if matched_payloads.empty?
+                          "No payloads available to scan further after keyword-matching, exiting Secret Detection scan"
+                        else
+                          "Retained #{total_payloads_retained} payloads to scan further after keyword-matching step"
+                        end
+          logger.info(
+            message: log_message,
+            given_total_payloads: payloads.length,
+            matched_payloads: matched_payloads.length,
+            payloads_to_scan_further: matched_payloads.map(&:id)
+          )
+          matched_payloads
         end
         # Runs the secret detection scan on the given list of payloads. It accepts
         # literal values to exclude from the input before the scan, also SD rules to exclude during
         # the scan when performed on the payloads.
         def run_scan(
-          payloads:, payload_timeout:, pattern_matcher:, raw_value_exclusions: [], rule_exclusions: [])
-          payloads.flat_map do |payload|
+          payloads:,
+          payload_timeout:,
+          pattern_matcher:,
+          max_findings_limit:,
+          exclusions: {},
+          rules: [])
+          all_applied_exclusions = Set.new
+          logger.info(
+            message: "Running Secret Detection scan sequentially",
+            payload_timeout:
+          )
+          capped_findings = payloads.lazy.flat_map do |payload|
             Timeout.timeout(payload_timeout) do
-              find_secrets_in_payload(
+              findings, applied_exclusions = find_secrets_in_payload(
                 payload:,
                 pattern_matcher:,
-                raw_value_exclusions:, rule_exclusions:
+                exclusions:,
+                rules:
               )
+              all_applied_exclusions.merge(applied_exclusions)
+              findings
             end
           rescue Timeout::Error => e
-            logger.error "Secret Detection scan timed out on the payload(id:#{payload.id}): #{e}"
+            logger.warn "Secret Detection scan timed out on the payload(id:#{payload.id}): #{e}"
             Core::Finding.new(payload.id,
               Core::Status::PAYLOAD_TIMEOUT)
-          end
+          end.take(max_findings_limit).to_a
+          [capped_findings, all_applied_exclusions.to_a]
         end
         def run_scan_within_subprocess(
-          payloads:, payload_timeout:, pattern_matcher:, raw_value_exclusions: [],
-          rule_exclusions: [])
+          payloads:,
+          payload_timeout:,
+          pattern_matcher:,
+          max_findings_limit:,
+          exclusions: {},
+          rules: []
+        )
+          all_applied_exclusions = Set.new
           payload_sizes = payloads.map(&:size)
           grouped_payload_indices = group_by_chunk_size(payload_sizes)
           grouped_payloads = grouped_payload_indices.map { |idx_arr| idx_arr.map { |i| payloads[i] } }
-          found_secrets = Parallel.flat_map(
-            grouped_payloads,
-            in_processes: MAX_PROCS_PER_REQUEST,
-            isolation: true # do not reuse sub-processes
-          ) do |grouped_payload|
-            grouped_payload.flat_map do |payload|
+          logger.info(
+            message: "Running Secret Detection scan within a subprocess",
+            grouped_payloads: grouped_payloads.length,
+            payload_timeout:
+          )
+          found_secrets = []
+          grouped_payloads.each do |grouped_payload|
+            break if found_secrets.length >= max_findings_limit
+            batch_results = Parallel.map(
+              grouped_payload,
+              in_processes: MAX_PROCS_PER_REQUEST,
+              isolation: true # do not reuse sub-processes
+            ) do |payload|
               Timeout.timeout(payload_timeout) do
-                find_secrets_in_payload(
+                findings, applied_exclusions = find_secrets_in_payload(
                   payload:,
                   pattern_matcher:,
-                  raw_value_exclusions:, rule_exclusions:
+                  exclusions:,
+                  rules:
                 )
+                [findings, applied_exclusions]
               end
             rescue Timeout::Error => e
-              logger.error "Secret Detection scan timed out on the payload(id:#{payload.id}): #{e}"
+              logger.warn "Secret Detection scan timed out on the payload(id:#{payload.id}): #{e}"
               Core::Finding.new(payload.id, Core::Status::PAYLOAD_TIMEOUT)
             end
+            # Process results and collect exclusions
+            batch_results.each do |findings, applied_exclusions|
+              all_applied_exclusions.merge(applied_exclusions)
+              remaining_slots = max_findings_limit - found_secrets.length
+              found_secrets.concat(findings.take(remaining_slots))
+              break if found_secrets.length >= max_findings_limit
+            end
           end
-          found_secrets.freeze
+          [found_secrets, all_applied_exclusions.to_a]
         end
         # Finds secrets in the given payload guarded with a timeout as a circuit breaker. It accepts
         # literal values to exclude from the input before the scan, also SD rules to exclude during
         # the scan.
-        def find_secrets_in_payload(payload:, pattern_matcher:, raw_value_exclusions: [], rule_exclusions: [])
+        def find_secrets_in_payload(payload:, pattern_matcher:, exclusions: {}, rules: @default_rules)
           findings = []
+          applied_exclusions = Set.new
           payload_offset = payload.respond_to?(:offset) ? payload.offset : 0
+          raw_value_exclusions = exclusions.fetch(:raw_value, [])
+          rule_exclusions = exclusions.fetch(:rule, [])
           payload.data
                  .each_line($INPUT_RECORD_SEPARATOR, chomp: true)
                  .each_with_index do |line, index|
             unless raw_value_exclusions.empty?
-              raw_value_exclusions.each do |value|
-                line.gsub!(value, '') # replace input that doesn't contain allowed value in it
+              raw_value_exclusions.each do |exclusion|
+                # strip the excluded raw value from the line so that it is not scanned;
+                # `.gsub!` returns 'self' if a replacement was made, otherwise 'nil'
+                excl_replaced = !!line.gsub!(exclusion.value, '')
+                applied_exclusions << exclusion if excl_replaced
               end
             end
@@ -284,27 +417,56 @@ module Gitlab
             matches.each do |match_idx|
               rule = rules[match_idx]
-              next if rule_exclusions.include?(rule[:id])
+              next if applied_rule_exclusion?(rule[:id], rule_exclusions, applied_exclusions)
+              title = rule[:title].nil? ? rule[:description] : rule[:title]
-              findings << Core::Finding.new(payload.id, Core::Status::FOUND, line_no, rule[:id],
-                rule[:description])
+              findings << Core::Finding.new(
+                payload.id,
+                Core::Status::FOUND,
+                line_no,
+                rule[:id],
+                title
+              )
             end
           end
-          findings.freeze
+          logger.info(
+            message: "Secret Detection scan found #{findings.length} secret leaks in the payload(id:#{payload.id})",
+            payload_id: payload.id,
+            detected_rules: findings.map { |f| "#{f.type}:#{f.line_number}" },
+            applied_exclusions: format_exclusions_arr(applied_exclusions)
+          )
+          [findings, applied_exclusions]
         rescue StandardError => e
           logger.error "Secret Detection scan failed on the payload(id:#{payload.id}): #{e}"
-          Core::Finding.new(payload.id, Core::Status::SCAN_ERROR)
+          [[Core::Finding.new(payload.id, Core::Status::SCAN_ERROR)], []]
+        end
+        def applied_rule_exclusion?(type, rule_exclusions, applied_exclusions)
+          applied_exclusion = rule_exclusions&.find { |rule_exclusion| rule_exclusion.value == type }
+          applied_exclusion && (applied_exclusions << applied_exclusion)
         end
         # Validates the given payloads by verifying the type and
         # presence of `id` and `data` fields necessary for the scan
         def validate_scan_input(payloads)
-          return false if payloads.nil? || !payloads.instance_of?(Array)
+          if payloads.nil? || !payloads.instance_of?(Array)
+            logger.debug(message: "Scan input validation error: payloads arg is empty or not instance of array")
+            return false
+          end
           payloads.all? do |payload|
-            payload.respond_to?(:id) && payload.respond_to?(:data)
+            has_valid_fields = payload.respond_to?(:id) && payload.respond_to?(:data) && payload.data.is_a?(String)
+            unless has_valid_fields
+              logger.debug(
+                message: "Scan input validation error: one of the payloads does not respond to `id` or `data`"
+              )
+            end
+            has_valid_fields
           end
         end
@@ -353,6 +515,75 @@ module Gitlab
           chunk_indexes
         end
+        # Returns array of strings with each representing a masked exclusion
+        #
+        # Example: For given arg exclusions = {
+        #     rule: ["gitlab_personal_access_token", "aws_key"],
+        #     path: ["test.py"],
+        #     raw_value: ["ABC123XYZ"]
+        # }
+        #
+        # The output will look like the following:
+        # [
+        #   "rules=gitlab_personal_access_token, aws_key",
+        #   "raw_values=AB*****YZ",
+        #   "paths=test.py"
+        # ]
+        def format_exclusions_hash(exclusions = {})
+          masked_raw_values = exclusions.fetch(:raw_value, []).map do |exclusion|
+            Gitlab::SecretDetection::Utils::Masker.mask_secret(exclusion.value)
+          end.join(", ")
+          paths = exclusions.fetch(:path, []).map(&:value).join(", ")
+          rules = exclusions.fetch(:rule, []).map(&:value).join(", ")
+          out = []
+          out << "rules=#{rules}" unless rules.empty?
+          out << "raw_values=#{masked_raw_values}" unless masked_raw_values.empty?
+          out << "paths=#{paths}" unless paths.empty?
+          out
+        end
+        def format_exclusions_arr(exclusions = [])
+          return [] if exclusions.empty?
+          masked_raw_values = Set.new
+          paths = Set.new
+          rules = Set.new
+          exclusions.each do |exclusion|
+            case exclusion.exclusion_type
+            when :EXCLUSION_TYPE_RAW_VALUE
+              masked_raw_values << Gitlab::SecretDetection::Utils::Masker.mask_secret(exclusion.value)
+            when :EXCLUSION_TYPE_RULE
+              rules << exclusion.value
+            when :EXCLUSION_TYPE_PATH
+              paths << exclusion.value
+            else
+              logger.warn("Unknown exclusion type #{exclusion.exclusion_type}")
+            end
+          end
+          out = []
+          out << "rules=#{rules.join(',')}" unless rules.empty?
+          out << "raw_values=#{masked_raw_values.join(',')}" unless masked_raw_values.empty?
+          out << "paths=#{paths.join(',')}" unless paths.empty?
+          out
+        end
+        def format_detected_secrets_metadata(findings = [])
+          return [] if findings.empty?
+          found_secrets = findings.filter do |f|
+            f.status == Core::Status::FOUND
+          end
+          found_secrets.map { |f| "#{f.payload_id}=>#{f.type}:#{f.line_number}" }
+        end
       end
     end
   end