RubyGems - sas-linter - Versions diffs - 0.1.0 - Mend

sas-linter 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

checksums.yaml +7 -0
data/LICENSE +661 -0
data/README.md +140 -0
data/Rakefile +11 -0
data/bin/sas_lint +79 -0
data/lib/sas_linter/rules/choose_one_template.rb +61 -0
data/lib/sas_linter/rules/commented_out_guard.rb +59 -0
data/lib/sas_linter/rules/encoding_issues.rb +322 -0
data/lib/sas_linter/rules/identical_if_else_branches.rb +104 -0
data/lib/sas_linter/rules/line_endings.rb +105 -0
data/lib/sas_linter/rules/malformed_if_condition.rb +291 -0
data/lib/sas_linter/rules/missing_assignment_semicolon.rb +141 -0
data/lib/sas_linter/rules/source_headers.rb +290 -0
data/lib/sas_linter/rules/tab_expansion.rb +98 -0
data/lib/sas_linter/rules/trailing_whitespace.rb +53 -0
data/lib/sas_linter/rules/unreachable_inner_branch_value.rb +202 -0
data/lib/sas_linter/rules/variable_value_out_of_known_range.rb +280 -0
data/lib/sas_linter/version.rb +5 -0
data/lib/sas_linter.rb +287 -0
metadata +96 -0

data/lib/sas_linter/rules/source_headers.rb ADDED Viewed

@@ -0,0 +1,290 @@
+# frozen_string_literal: true
+require "set"
+require_relative "../../sas_linter"
+require "sas_lexer"
+class SasLinter
+  module Rules
+    # Restore the standard 90-char `**...**;` header convention to broken SAS
+    # source files. Detects header lines that *look* like `**`-comments
+    # but produce DEFAULT-channel tokens, and re-wraps them as proper
+    # `**  ...  **;` rows.
+    #
+    # Working sources use a uniform 90-char-wide header where each
+    # line is its own self-contained `*` comment statement:
+    #
+    #     ****************************************************************************************;
+    #     **  PROGRAM:          ...                                                            **;
+    #     **  BY:               ...                                                            **;
+    #
+    # Broken sources have lines that look like comments (start with
+    # `**`) but produce DEFAULT-channel tokens. Two flavors:
+    #
+    #   A. Missing trailing `;` on every header line — the whole
+    #      header is one giant unterminated `*` comment until the
+    #      first inline `;` ends it, leaking the rest of that
+    #      physical line and following lines onto DEFAULT.
+    #
+    #   B. Trailing `**;` is present but an inline `;` (e.g. a
+    #      semicolon-separated list like `First Reviewer; Second
+    #      Reviewer`) terminates the comment in the middle of the
+    #      line — what follows the inline `;` ends up on DEFAULT
+    #      even though the line *looks* terminated.
+    #
+    # Some files also have header continuation lines (text that
+    # should be inside a `**` comment) that lost their `**` prefix
+    # during a text-conversion step. Those are detected only inside
+    # the file's leading header block — *before* the first KW_DATA /
+    # KW_PROC token the lexer reports — so legitimate body code
+    # sandwiched between `**` marker comments is left alone.
+    #
+    # Recognized config options:
+    #   autofix: true | false   (default: false)
+    class SourceHeaders < Rule
+      rule_id :source_headers
+      description "Header lines look like `**`-comments but lex as code; will be re-wrapped."
+      severity :warning
+      TARGET_WIDTH = 90
+      PAD_TO       = TARGET_WIDTH - 3 # leave 3 chars for trailing `**;`
+      DEFAULT_CHANNEL = SasLexer::Lexer::TokenChannel::DEFAULT
+      KW_DATA         = SasLexer::Lexer::TokenType::KW_DATA
+      KW_PROC         = SasLexer::Lexer::TokenType.const_get(:KW_PROC) if SasLexer::Lexer::TokenType.const_defined?(:KW_PROC)
+      C_STYLE_COMMENT = SasLexer::Lexer::TokenType::C_STYLE_COMMENT
+      IDENTIFIER      = SasLexer::Lexer::TokenType::IDENTIFIER
+      SEMI            = SasLexer::Lexer::TokenType::SEMI
+      ASSIGN          = SasLexer::Lexer::TokenType::ASSIGN
+      # This rule implements `autofix`, so it advertises autofix support.
+      def self.supports_autofix?
+        true
+      end
+      # Report one finding per broken header line in `source`.
+      #
+      # The token arguments are unused: the rule re-lexes `source` itself.
+      # Returns [] when `source` is not supplied. Each finding is anchored
+      # at column 1 of the 1-based line number.
+      # NOTE(review): only the lexer-based detection is reported here; the
+      # bare-asterisk rows that `autofix` also repairs are not reported —
+      # confirm that asymmetry is intended.
+      def check(_tokens, path:, all_tokens: nil, source: nil) # rubocop:disable Lint/UnusedMethodArgument
+        return [] unless source
+        broken_header_lines(source).map do |line_idx|
+          finding(
+            line: line_idx + 1,
+            column: 1,
+            message: "broken header line#{autofix? ? ' (autofixed)' : ''}",
+            path: path
+          )
+        end
+      end
+      # Return a repaired copy of `source`.
+      #
+      # Detection and rewriting are repeated on freshly lexed text until no
+      # broken line remains, capped at 10 passes (presumably because fixing
+      # one line can change how the following lines lex).
+      def autofix(source)
+        # Step 0: expand any tab characters to 4 spaces. Tabs in
+        # SAS source headers often come from Word docs, and
+        # break the column-alignment of the header box. Doing this
+        # first means every downstream check sees consistent column
+        # offsets.
+        # Note: this applies to the whole text, even when no header line
+        # turns out to be broken.
+        text = source.gsub("\t", "    ")
+        10.times do
+          tokens = tokenize(text)
+          skip   = c_comment_lines(tokens)
+          # Union of lexer-detected broken lines and bare `****` rows that
+          # lack a terminating `;`.
+          bad    = broken_lines_for(text, tokens, skip) |
+                   asterisk_rows_missing_semi_for(text, skip)
+          break if bad.empty?
+          text = rewrite(text, bad)
+        end
+        text
+      end
+      # 0-indexed line numbers the lexer thinks are broken header
+      # text in `source`. Public so the rule's `check` can produce
+      # findings without re-tokenizing on its own.
+      #
+      # Lines inside `/* ... */` comments are excluded. Returns the Set
+      # built by `broken_lines_for`.
+      def broken_header_lines(source)
+        tokens = tokenize(source)
+        broken_lines_for(source, tokens, c_comment_lines(tokens))
+      end
+      private
+      # Lex `text`. The Rust lexer demands valid UTF-8; some legacy SAS
+      # sources ship with stray Windows-1252 bytes (smart quotes). We
+      # make a UTF-8-safe copy for the lexer call, then operate on
+      # the original byte string for offset math — the byte positions
+      # line up because we only replace bytes, never insert or delete.
+      # NOTE(review): `scrub("?")` may substitute a single `?` for a
+      # multi-byte invalid sequence, so byte offsets are not guaranteed to
+      # match; the callers in this class only use token line numbers.
+      #
+      # Returns whatever `SasLexer::Lexer#tokenize` returns. The lexer
+      # handle is released in `ensure`, even when lexing raises.
+      def tokenize(text)
+        # Work on a copy so the caller's string keeps its encoding.
+        utf8 = text.dup.force_encoding(Encoding::UTF_8)
+        utf8 = utf8.scrub("?") unless utf8.valid_encoding?
+        lexer = SasLexer::Lexer.new
+        begin
+          lexer.tokenize(utf8)
+        ensure
+          lexer.free
+        end
+      end
+      # 0-indexed line number of the first body keyword (KW_DATA /
+      # KW_PROC). Lines at or after this cutoff are body code, not
+      # header. Falls back to `total_lines` for fragments that have
+      # no data/proc step.
+      #
+      # `tokens` is the lexer output (hashes with :type and a 1-based
+      # :start_line); `total_lines` is the number of lines in the text.
+      def header_cutoff_line(tokens, total_lines)
+        first_body = tokens.find do |t|
+          t[:type] == KW_DATA || (KW_PROC && t[:type] == KW_PROC)
+        end
+        # Token lines are 1-based; convert to the 0-based index used here.
+        first_body ? first_body[:start_line] - 1 : total_lines
+      end
+      # Set of 0-indexed line numbers that fall inside a `/* ... */`
+      # C_STYLE_COMMENT token. Legacy SAS sources sometimes embed
+      # large code blocks in such comments; header repair must skip
+      # those lines.
+      def c_comment_lines(tokens)
+        lines = Set.new
+        tokens.each do |tok|
+          next unless tok[:type] == C_STYLE_COMMENT
+          ((tok[:start_line] - 1)..(tok[:end_line] - 1)).each { |ln| lines << ln }
+        end
+        lines
+      end
+      # A line is "prose-only" iff its DEFAULT-channel tokens contain
+      # no SAS-syntax control tokens (no `;`, no `=`). Real body code
+      # always has at least one of those; prose ("CHECK WITH AUTHOR
+      # FOR OTHERS") has neither.
+      def prose_only_line?(tokens, line_idx)
+        saw_default = false
+        tokens.each do |tok|
+          next unless tok[:start_line] - 1 == line_idx
+          next unless tok[:channel] == DEFAULT_CHANNEL
+          saw_default = true
+          return false if tok[:type] == SEMI || tok[:type] == ASSIGN
+        end
+        saw_default
+      end
+      # Lexer-driven detection of broken header lines.
+      #
+      # text       - the source being checked (split on "\n").
+      # tokens     - lexer output for `text`.
+      # skip_lines - Set of 0-indexed lines to ignore (inside `/* */`).
+      #
+      # Returns a Set of 0-indexed line numbers. A line qualifies when it
+      # carries a DEFAULT-channel IDENTIFIER and either starts with `**`
+      # without ending in `**;`, or sits before the header cutoff between
+      # two `**` lines and is prose-only.
+      def broken_lines_for(text, tokens, skip_lines)
+        lines     = text.split("\n", -1)
+        cutoff_ln = header_cutoff_line(tokens, lines.length)
+        bad = Set.new
+        # Pattern A: the Rust lexer reports a DEFAULT-channel
+        # IDENTIFIER on a line that's otherwise a `**` comment block.
+        # IDENTIFIERs are the diagnostic shape — when prose (e.g. a
+        # list of reviewers separated by `;`) leaks past an inline
+        # `;` it lexes as variable references. A bare DEFAULT SEMI
+        # from `**A; **B; ;` is a harmless null statement and must
+        # not flag the line.
+        default_lines = Set.new
+        tokens.each do |tok|
+          next unless tok[:channel] == DEFAULT_CHANNEL && tok[:type] == IDENTIFIER
+          default_lines << (tok[:start_line] - 1)
+        end
+        default_lines.each do |i|
+          next if skip_lines.include?(i)
+          line = lines[i] or next
+          if line.lstrip.start_with?("**")
+            # Skip lines that already look properly terminated
+            # `**  ...  **;`. If the lexer reports default-channel
+            # IDENTIFIERs on such a line, it's almost always because
+            # something *upstream* is unterminated (e.g. a missing
+            # `;` after `value foo 0='x' 1='y'`) — re-padding this
+            # line won't fix the upstream problem.
+            next if line.rstrip.end_with?("**;")
+            bad << i
+          elsif i < cutoff_ln
+            # Continuation candidate: only considered inside the leading
+            # header block, and only when both nearest non-blank
+            # neighbours are `**` lines.
+            prev = nearest_nonblank(lines, i, -1)
+            nxt  = nearest_nonblank(lines, i, +1)
+            next unless prev&.lstrip&.start_with?("**") && nxt&.lstrip&.start_with?("**")
+            # Stricter than just "sandwiched": require the line itself
+            # to be prose only. This protects body code (`A=0;`,
+            # `if x then y;`) that happens to sit between `**` marker
+            # comments.
+            bad << i if prose_only_line?(tokens, i)
+          end
+        end
+        # No textual heuristic for "header-shaped lines without
+        # trailing `;`" (formerly Pattern C). The SAS lexer accepts
+        # plenty of shapes the heuristic flagged —
+        # `** START OF SAS CODE **` (no `;`),
+        # `**  REVISION DATES:  03/15/12; 10/07/2025  **;` (inline
+        # `;` in prose with proper end terminator), `**...**:`
+        # (colon instead of semicolon) — and Pattern A above already
+        # catches every line where default-channel code actually
+        # leaks. Cosmetic-only re-padding is not worth the diff churn.
+        bad
+      end
+      # Walk from index `from` in direction `step` (+1 or -1 as called
+      # here) and return the first line that is not blank (empty or
+      # whitespace-only). The line at `from` itself is not examined.
+      # Returns nil when the edge of `lines` is reached first.
+      def nearest_nonblank(lines, from, step)
+        i = from + step
+        while i >= 0 && i < lines.length
+          return lines[i] unless lines[i].strip.empty?
+          i += step
+        end
+        nil
+      end
+      # Purely textual pass: return the Set of 0-indexed lines that consist
+      # of nothing but asterisks (ignoring surrounding whitespace), i.e.
+      # separator rows with no terminating `;`. Lines listed in
+      # `skip_lines` are ignored.
+      def asterisk_rows_missing_semi_for(text, skip_lines)
+        bad = Set.new
+        text.split("\n", -1).each_with_index do |line, i|
+          next if skip_lines.include?(i)
+          # The `end_with?(";")` test is redundant: a line matching
+          # /\A\*+\z/ after strip cannot end in `;`.
+          bad << i if line.strip.match?(/\A\*+\z/) && !line.rstrip.end_with?(";")
+        end
+        bad
+      end
+      def rewrite(text, bad)
+        lines = text.split("\n", -1)
+        out = []
+        lines.each_with_index do |line, i|
+          if bad.include?(i)
+            out.concat(rewrite_line(line))
+          else
+            out << line
+          end
+        end
+        out.join("\n")
+      end
+      # Rewrite one broken line into one or more proper `**  ...  **;`
+      # lines.
+      def rewrite_line(line)
+        stripped = line.rstrip
+        return ["#{stripped};"] if stripped.match?(/\A\*+\z/)
+        # Continuation line missing `**` prefix — re-add it.
+        stripped = "**  #{stripped.lstrip}" unless stripped.start_with?("**")
+        # Strip an existing trailing `**;` or `;` so we re-pad
+        # consistently.
+        stripped = if stripped.end_with?("**;")
+                     stripped[0..-4].rstrip
+                   elsif stripped.end_with?(";")
+                     stripped[0..-2].rstrip
+                   else
+                     stripped
+                   end
+        # Split only on `\s+\*\*\s+` — the signature of two
+        # `**...**;` comments that lost their line break. Inline `;`
+        # mid-prose is preserved as-is: once we append a trailing
+        # `**;`, the SAS lexer's predictive `**...**;` recognition
+        # consumes the whole line as one COMMENT-channel token, so
+        # the inline `;` no longer closes the comment early.
+        segments = stripped.split(/\s+\*\*\s+/)
+        segments.each_with_index.map do |seg, idx|
+          text = idx.zero? ? seg.rstrip : "**  #{seg.strip}"
+          text = "**  #{text}" unless text.start_with?("**")
+          text = text.ljust(PAD_TO) if text.length < PAD_TO
+          "#{text}**;"
+        end
+      end
+    end
+  end
+end

data/lib/sas_linter/rules/tab_expansion.rb ADDED Viewed

@@ -0,0 +1,98 @@
+# frozen_string_literal: true
+require_relative "../../sas_linter"
+class SasLinter
+  module Rules
+    # Flag literal TAB (`\t`) characters in source. SAS authoring
+    # conventions strongly prefer spaces — tabs render at different
+    # widths in different editors and break the column alignment
+    # SAS sources often rely on for readability.
+    #
+    # When `autofix` is true, each tab is replaced with the number
+    # of spaces needed to reach the next column-aligned tab stop
+    # (i.e., the standard `expand(1)` semantics with the configured
+    # width). A tab in column N expands to `width - (N % width)`
+    # spaces, so leading whitespace, mid-line alignment, and pre-
+    # token padding all stay column-aligned post-fix.
+    #
+    # Recognized config options:
+    #   width:   integer (default 8)
+    #   autofix: true | false (default false)
+    class TabExpansion < Rule
+      rule_id :tab_expansion
+      description "Line contains a literal TAB character; will be expanded to spaces."
+      severity :warning
+      DEFAULT_WIDTH = 8
+      # This rule implements `autofix`, so it advertises autofix support.
+      def self.supports_autofix?
+        true
+      end
+      # Build the rule from a config hash. Keys may be strings or symbols
+      # (they are normalized to strings first).
+      #
+      # "width"   - converted with Integer(); defaults to DEFAULT_WIDTH.
+      #             Integer() raises for values it cannot convert.
+      # "autofix" - any truthy value enables autofix.
+      def self.from_config(opts = {})
+        opts = opts.transform_keys(&:to_s)
+        new(
+          width: Integer(opts.fetch("width", DEFAULT_WIDTH)),
+          autofix: opts["autofix"] ? true : false
+        )
+      end
+      attr_reader :width
+      # width   - tab-stop width in columns; must convert to an integer >= 1.
+      # autofix - forwarded to the Rule superclass.
+      #
+      # Raises ArgumentError when `width.to_i` is below 1.
+      def initialize(width: DEFAULT_WIDTH, autofix: false)
+        super(autofix: autofix)
+        raise ArgumentError, "width must be positive (got #{width})" if width.to_i < 1
+        @width = Integer(width)
+      end
+      # Report one finding per literal TAB character in `source`.
+      #
+      # The token arguments are unused; returns [] when `source` is not
+      # supplied. `line` is 1-based; `column` is the tab's 1-based
+      # character index within the line (not its rendered column).
+      def check(_tokens, path:, all_tokens: nil, source: nil) # rubocop:disable Lint/UnusedMethodArgument
+        return [] unless source
+        findings = []
+        source.each_line.with_index do |line, idx|
+          # Drop the LF / CRLF terminator before scanning.
+          chomped = line.sub(/\r?\n\z/, "")
+          next unless chomped.include?("\t")
+          chomped.each_char.with_index do |ch, col|
+            next unless ch == "\t"
+            findings << finding(
+              line: idx + 1,
+              column: col + 1,
+              message: "tab character#{autofix? ? " (expanded to #{@width}-space tab stop)" : ''}",
+              path: path
+            )
+          end
+        end
+        findings
+      end
+      # Replace every tab with `width - (col % width)` spaces, where
+      # `col` is the post-expansion column of the tab. Re-counts per
+      # line so the line terminator resets the column.
+      #
+      # Returns a new string; each line keeps its original terminator.
+      def autofix(source)
+        source.each_line.map { |line| expand_line(line) }.join
+      end
+      private
+      # Expand the tabs of a single line. A trailing LF or CRLF is split
+      # off first and re-attached unchanged, so it never counts toward the
+      # column position.
+      def expand_line(line)
+        eol_match = line.match(/\r?\n\z/)
+        terminator = eol_match ? eol_match[0] : ""
+        body = eol_match ? line[0...eol_match.begin(0)] : line
+        out = +""
+        body.each_char do |ch|
+          if ch == "\t"
+            # `out.length` is the 0-based column reached so far, counted in
+            # characters; pad up to the next multiple of @width.
+            out << (" " * (@width - (out.length % @width)))
+          else
+            out << ch
+          end
+        end
+        out + terminator
+      end
+    end
+  end
+end

data/lib/sas_linter/rules/trailing_whitespace.rb ADDED Viewed

@@ -0,0 +1,53 @@
+# frozen_string_literal: true
+require_relative "../../sas_linter"
+class SasLinter
+  module Rules
+    # Flag end-of-line trailing whitespace (spaces or tabs that
+    # appear before the line terminator). Trailing whitespace is
+    # invisible noise — it inflates diffs, fights with editor
+    # auto-trim, and hides intent. Supports `autofix` to strip the
+    # offending bytes in place.
+    #
+    # Recognized config options:
+    #   autofix: true | false   (default: false)
+    class TrailingWhitespace < Rule
+      rule_id :trailing_whitespace
+      description "Line has trailing whitespace before the newline."
+      severity :warning
+      TRAILING_WS = /([ \t]+)(\r?\n|\z)/
+      # This rule implements `autofix`, so it advertises autofix support.
+      def self.supports_autofix?
+        true
+      end
+      # Report one finding per line that ends in spaces or tabs.
+      #
+      # The token arguments are unused; returns [] when `source` is not
+      # supplied. `line` is 1-based; `column` is the 1-based character
+      # index where the trailing whitespace run starts.
+      def check(_tokens, path:, all_tokens: nil, source: nil) # rubocop:disable Lint/UnusedMethodArgument
+        return [] unless source
+        findings = []
+        source.each_line.with_index do |line, idx|
+          # Drop the LF / CRLF terminator so `\z` anchors at end of content.
+          chomped = line.sub(/\r?\n\z/, "")
+          next unless chomped =~ /([ \t]+)\z/
+          ws_start = ::Regexp.last_match.begin(1)
+          findings << finding(
+            line: idx + 1,
+            column: ws_start + 1,
+            message: "trailing whitespace#{autofix? ? ' (autofixed)' : ''}",
+            path: path
+          )
+        end
+        findings
+      end
+      # Strip end-of-line trailing whitespace while preserving the
+      # original line terminator (LF or CRLF) and the trailing
+      # newline (or its absence) on the final line.
+      #
+      # Each TRAILING_WS match is replaced by its second capture group,
+      # i.e. the terminator (or the empty string at end of input).
+      def autofix(source)
+        source.gsub(TRAILING_WS) { ::Regexp.last_match(2) }
+      end
+    end
+  end
+end

data/lib/sas_linter/rules/unreachable_inner_branch_value.rb ADDED Viewed

@@ -0,0 +1,202 @@
+# frozen_string_literal: true
+require_relative "../../sas_linter"
+require "sas_lexer"
+class SasLinter
+  module Rules
+    # Flag inner branches whose comparison values are excluded by an
+    # enclosing `if VAR in (...) then do; ... end;` guard.
+    #
+    # Motivating shape: an outer guard
+    # `if RANK in (0,1,2,3,4,5,6,8) then do;` omits 7, while an inner
+    # `if RANK in (5,6,7,8) then cOut = 2;` lists 7. Value 7 falls
+    # through the outer guard, so the inner branch can never fire for it
+    # and cOut silently stays missing.
+    #
+    # Detection: outer guard pushes a {var, allowed_set} frame; inner
+    # `if VAR in (...)`, `if VAR = N`, or `if VAR eq N` references inside
+    # the same DO block are checked against that set. Values absent from
+    # the outer set produce a finding.
+    class UnreachableInnerBranchValue < Rule
+      rule_id :unreachable_inner_branch_value
+      description "Inner branch references a value that the enclosing " \
+                  "outer guard excludes — branch is unreachable for that value."
+      severity :warning
+      TT = SasLexer::Lexer::TokenType
+      # Outer guard pattern: KW_IF IDENT KW_IN LPAREN <lits...> RPAREN KW_THEN KW_DO SEMI
+      # Inner check patterns:
+      #   KW_IF IDENT(V) KW_IN LPAREN <lits...> RPAREN
+      #   KW_IF IDENT(V) KW_EQ <lit>
+      #   KW_IF IDENT(V) ASSIGN <lit>     (SAS uses `=` as comparison in IF)
+      # Scan `tokens` once, tracking DO/END nesting and a stack of active
+      # outer guards, and return the findings collected from every IF
+      # condition that `analyze_if` can reason about.
+      #
+      # `all_tokens` and `source` are unused.
+      def check(tokens, path:, all_tokens: nil, source: nil) # rubocop:disable Lint/UnusedMethodArgument
+        findings = []
+        guard_stack = [] # array of {var:, allowed:, depth:}
+        do_depth = 0
+        i = 0
+        while i < tokens.length
+          tok = tokens[i]
+          if tok[:type] == TT::KW_IF
+            consumed, frame, inner_findings =
+              analyze_if(tokens, i, do_depth, guard_stack, path)
+            findings.concat(inner_findings)
+            if frame
+              # analyze_if consumed the guard's `do;`, so account for the
+              # block it opens here.
+              guard_stack.push(frame)
+              do_depth += 1
+            end
+            i += consumed
+            next
+          end
+          if tok[:type] == TT::KW_DO
+            # bare `do;` (no IF prefix), or `do i = 1 to N;` — both increment depth
+            do_depth += 1
+            i += 1
+            next
+          end
+          if tok[:type] == TT::KW_END
+            # Never let unbalanced ENDs drive the depth negative, then drop
+            # every guard whose block has just been closed.
+            do_depth -= 1 if do_depth > 0
+            guard_stack.pop while guard_stack.last && guard_stack.last[:depth] > do_depth
+            i += 1
+            next
+          end
+          i += 1
+        end
+        findings
+      end
+      private
+      # Returns [tokens_consumed, new_guard_frame_or_nil, findings].
+      # Skips ahead through the entire condition expression but not the body.
+      #
+      # tokens      - the token list being scanned; tokens[i] is the KW_IF.
+      # do_depth    - DO nesting depth at the IF.
+      # guard_stack - active guard frames, innermost last (read only here).
+      # path        - passed through to each finding.
+      #
+      # When the condition is not `IDENT <simple comparison>`, returns
+      # [1, nil, []] so the caller just steps past the IF token.
+      # NOTE(review): a simple condition followed by `then do;` is always
+      # treated as a new guard and is never checked against an enclosing
+      # guard on the same variable — confirm nested guards should be exempt.
+      # NOTE(review): `to_set` needs the `set` library loaded; this file
+      # does not require it itself — presumably sas_linter.rb does.
+      def analyze_if(tokens, i, do_depth, guard_stack, path)
+        # tokens[i] is KW_IF
+        j = i + 1
+        ident = tokens[j]
+        return [1, nil, []] unless ident && ident[:type] == TT::IDENTIFIER
+        # Variable names are compared case-insensitively.
+        var = ident[:text].downcase
+        op = tokens[j + 1]
+        return [1, nil, []] unless op
+        values, end_of_cond, simple = parse_comparison(tokens, j + 1, var, ident[:text])
+        return [1, nil, []] unless simple
+        # Now look for `then do;` immediately after end_of_cond to detect outer guards
+        k = end_of_cond
+        is_outer_guard =
+          tokens[k] && tokens[k][:type] == TT::KW_THEN &&
+          tokens[k + 1] && tokens[k + 1][:type] == TT::KW_DO &&
+          tokens[k + 2] && tokens[k + 2][:type] == TT::SEMI
+        # Generate findings for any active guard on this variable. (Skip the
+        # outer guard itself — its own values define the allowed set.)
+        findings = []
+        unless is_outer_guard
+          # Innermost active guard on the same variable wins.
+          active = guard_stack.reverse.find { |f| f[:var] == var }
+          if active
+            values.each do |val|
+              next if active[:allowed].include?(val[:key])
+              findings << finding(
+                line: val[:line],
+                column: val[:column],
+                message: "value #{val[:display]} for #{ident[:text]} is excluded by " \
+                         "the enclosing `if #{ident[:text]} in (...)` guard at line #{active[:line]}; " \
+                         "this branch is unreachable.",
+                path: path
+              )
+            end
+          end
+        end
+        new_frame = nil
+        consumed = (end_of_cond - i)
+        if is_outer_guard
+          new_frame = {
+            var: var,
+            allowed: values.map { |v| v[:key] }.to_set,
+            # Depth the caller will be at once it counts this guard's DO.
+            depth: do_depth + 1,
+            line: tokens[i][:start_line]
+          }
+          consumed = (k + 3) - i # consume through SEMI
+        end
+        [consumed, new_frame, findings]
+      end
+      # Parse one of:
+      #   KW_IN  LPAREN <lits...> RPAREN
+      #   KW_EQ  <lit>
+      #   ASSIGN <lit>
+      # Returns [values, index_after_condition, simple?].
+      # `values` is array of {key:, display:, line:, column:}.
+      # `simple?` is false if the condition contains anything we can't reason
+      # about (macros, references, expressions) — caller bails.
+      #
+      # `op_idx` is the index of the operator token. The last two
+      # parameters are accepted but unused. On failure the returned index
+      # is `op_idx` itself and `values` is empty.
+      def parse_comparison(tokens, op_idx, _var, _orig_text)
+        op = tokens[op_idx]
+        return [[], op_idx, false] unless op
+        case op[:type]
+        when TT::KW_IN
+          lparen = tokens[op_idx + 1]
+          return [[], op_idx, false] unless lparen && lparen[:type] == TT::LPAREN
+          values = []
+          k = op_idx + 2
+          loop do
+            t = tokens[k]
+            # Ran off the end of the token list before finding `)`.
+            return [[], op_idx, false] unless t
+            if t[:type] == TT::RPAREN
+              return [values, k + 1, true]
+            elsif t[:type] == TT::COMMA
+              k += 1
+              next
+            elsif (val = literal_value(t))
+              values << val
+              k += 1
+            else
+              # Unparseable literal (macro, identifier, expression). Bail.
+              return [[], op_idx, false]
+            end
+          end
+        when TT::KW_EQ, TT::ASSIGN
+          lit = tokens[op_idx + 1]
+          val = literal_value(lit)
+          return [[], op_idx, false] unless val
+          [[val], op_idx + 2, true]
+        else
+          [[], op_idx, false]
+        end
+      end
+      def literal_value(tok)
+        return nil unless tok
+        case tok[:type]
+        when TT::INTEGER_LITERAL
+          n = Integer(tok[:text]) rescue (return nil)
+          { key: ["int", n], display: tok[:text], line: tok[:start_line], column: tok[:start_column] + 1 }
+        when TT::FLOAT_LITERAL
+          f = Float(tok[:text]) rescue (return nil)
+          # Treat 5.0 as equivalent to 5 for set membership.
+          key = (f == f.to_i) ? ["int", f.to_i] : ["float", f]
+          { key: key, display: tok[:text], line: tok[:start_line], column: tok[:start_column] + 1 }
+        when TT::STRING_LITERAL
+          { key: ["str", tok[:text]], display: tok[:text], line: tok[:start_line], column: tok[:start_column] + 1 }
+        end
+      end
+    end
+  end
+end