RubyGems - sas-linter - Versions diffs - 0.1.0 - Mend

sas-linter 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

checksums.yaml +7 -0
data/LICENSE +661 -0
data/README.md +140 -0
data/Rakefile +11 -0
data/bin/sas_lint +79 -0
data/lib/sas_linter/rules/choose_one_template.rb +61 -0
data/lib/sas_linter/rules/commented_out_guard.rb +59 -0
data/lib/sas_linter/rules/encoding_issues.rb +322 -0
data/lib/sas_linter/rules/identical_if_else_branches.rb +104 -0
data/lib/sas_linter/rules/line_endings.rb +105 -0
data/lib/sas_linter/rules/malformed_if_condition.rb +291 -0
data/lib/sas_linter/rules/missing_assignment_semicolon.rb +141 -0
data/lib/sas_linter/rules/source_headers.rb +290 -0
data/lib/sas_linter/rules/tab_expansion.rb +98 -0
data/lib/sas_linter/rules/trailing_whitespace.rb +53 -0
data/lib/sas_linter/rules/unreachable_inner_branch_value.rb +202 -0
data/lib/sas_linter/rules/variable_value_out_of_known_range.rb +280 -0
data/lib/sas_linter/version.rb +5 -0
data/lib/sas_linter.rb +287 -0
metadata +96 -0

data/lib/sas_linter/rules/identical_if_else_branches.rb ADDED Viewed

@@ -0,0 +1,104 @@
+# frozen_string_literal: true
+require_relative "../../sas_linter"
+require "sas_lexer"
+class SasLinter
+  module Rules
+    # Flag `if COND then S; else S;` where the THEN and ELSE bodies are
+    # identical token-for-token — the condition has no effect on the
+    # outcome, which is almost always a copy-paste error.
+    #
+    # Motivating bug (`docs/AK_LOC_HOME_CARE_SCALE_notes.txt` #1):
+    #
+    #     if iK3 in (6,7,8) then NF1_2=0; else NF1_2=0;
+    #
+    # Both branches assign `NF1_2 = 0`; the THEN should have been `=1`.
+    #
+    # Scope: simple-statement bodies only (`then STMT; else STMT;`). The
+    # block form (`then do; ... end; else do; ... end;`) is ignored — it's
+    # rare and the equivalence check would need to span an unbounded body.
+    class IdenticalIfElseBranches < Rule
+      rule_id :identical_if_else_branches
+      description "`if ... then S; else S;` — THEN and ELSE bodies are " \
+                  "identical, so the condition has no effect."
+      severity :warning
+      TT = SasLexer::Lexer::TokenType
+      def check(tokens, path:, all_tokens: nil, source: nil) # rubocop:disable Lint/UnusedMethodArgument
+        findings = []
+        i = 0
+        while i < tokens.length
+          tok = tokens[i]
+          if tok[:type] == TT::KW_THEN
+            # Bail on `then do;` — only handle simple statement bodies.
+            nxt = tokens[i + 1]
+            if nxt && nxt[:type] != TT::KW_DO
+              then_body, after_then = collect_simple_body(tokens, i + 1)
+              if then_body && tokens[after_then] && tokens[after_then][:type] == TT::KW_ELSE
+                else_idx = after_then
+                # Same bail-out for `else do;`.
+                else_first = tokens[else_idx + 1]
+                if else_first && else_first[:type] != TT::KW_DO
+                  else_body, after_else = collect_simple_body(tokens, else_idx + 1)
+                  if else_body && bodies_equivalent?(then_body, else_body)
+                    findings << finding(
+                      line: tokens[else_idx][:start_line],
+                      column: tokens[else_idx][:start_column] + 1,
+                      message: "`if ... then #{render_body(then_body)}; else #{render_body(else_body)};` — " \
+                               "branches are identical; the condition has no effect.",
+                      path: path
+                    )
+                    i = after_else
+                    next
+                  end
+                end
+              end
+            end
+          end
+          i += 1
+        end
+        findings
+      end
+      private
+      # Collect tokens for one statement body starting at `start_idx`, up to
+      # (but not including) the terminating SEMI. Returns [body_tokens,
+      # index_after_semi] or [nil, start_idx] if no SEMI is found before EOF.
+      def collect_simple_body(tokens, start_idx)
+        body = []
+        k = start_idx
+        while k < tokens.length
+          t = tokens[k]
+          return [body, k + 1] if t[:type] == TT::SEMI
+          body << t
+          k += 1
+        end
+        [nil, start_idx]
+      end
+      # Two bodies are equivalent if they have the same token types and the
+      # same normalized text. Identifiers and keywords are SAS-case-insensitive,
+      # so compare downcased text.
+      def bodies_equivalent?(a, b)
+        return false unless a.length == b.length
+        a.each_with_index.all? do |ta, idx|
+          tb = b[idx]
+          ta[:type] == tb[:type] && ta[:text].downcase == tb[:text].downcase
+        end
+      end
+      def render_body(body)
+        body.map { |t| t[:text] }.join(" ").gsub(/\s+([,;()])/, '\1').gsub(/([,(])\s+/, '\1')
+      end
+    end
+  end
+end

data/lib/sas_linter/rules/line_endings.rb ADDED Viewed

@@ -0,0 +1,105 @@
+# frozen_string_literal: true
+require_relative "../../sas_linter"
+class SasLinter
+  module Rules
+    # Flag non-standard line endings in SAS sources. Two patterns
+    # appear in legacy SAS sources and tend to be hand-fixed when
+    # they show up:
+    #
+    #   1. `\r\r\n` — double CR before LF. Word/Outlook copy-paste
+    #      injects an extra CR; SAS Viya tolerates it but downstream
+    #      tools and diffs treat the file as if every line had a
+    #      trailing literal CR character.
+    #
+    #   2. Lone `\r` (CR not followed by LF) — old-Mac CR-only
+    #      endings. SAS Viya treats the entire file as one logical
+    #      line, breaking saspy's shard-based submission flow.
+    #
+    # Autofix collapses `\r\r\n` to `\r\n` unconditionally and maps
+    # every lone `\r` to the file's dominant ending: `\r\n` if the
+    # source has any CRLF (i.e. it's a Windows file with stragglers),
+    # `\n` otherwise (i.e. pure-CR file → POSIX).
+    #
+    # Recognized config options:
+    #   autofix: true | false   (default: false)
+    class LineEndings < Rule
+      rule_id :line_endings
+      description "Source has non-standard line endings (double-CR or lone CR)."
+      severity :warning
+      def self.supports_autofix?
+        true
+      end
+      def check(_tokens, path:, all_tokens: nil, source: nil) # rubocop:disable Lint/UnusedMethodArgument
+        return [] unless source
+        findings = []
+        bytes = source.b.bytes
+        line = 1
+        col = 1
+        i = 0
+        n = bytes.length
+        while i < n
+          b = bytes[i]
+          if b == 0x0D && bytes[i + 1] == 0x0D && bytes[i + 2] == 0x0A
+            findings << finding(
+              line: line,
+              column: col,
+              message: "double CR before LF (\\r\\r\\n)#{autofix? ? ' (autofixed)' : ''}",
+              path: path
+            )
+            line += 1
+            col = 1
+            i += 3
+          elsif b == 0x0D && bytes[i + 1] == 0x0A
+            line += 1
+            col = 1
+            i += 2
+          elsif b == 0x0D
+            findings << finding(
+              line: line,
+              column: col,
+              message: "lone CR (\\r)#{autofix? ? ' (autofixed)' : ''}",
+              path: path
+            )
+            line += 1
+            col = 1
+            i += 1
+          elsif b == 0x0A
+            line += 1
+            col = 1
+            i += 1
+          else
+            col += 1
+            i += 1
+          end
+        end
+        findings
+      end
+      # Collapse `\r\r\n` to `\r\n`; map every remaining lone `\r` to
+      # the file's dominant terminator (`\r\n` if any CRLF survives,
+      # else `\n`).
+      def autofix(source)
+        # Step 1: remove the duplicate CR in `\r\r\n` sequences. This
+        # leaves at most one `\r` adjacent to `\n` (real CRLF) and
+        # any other `\r` on its own.
+        step1 = source.b.gsub(/\r\r\n/, "\r\n")
+        # Step 2: pick the dominant terminator. `\r\n` wins if there
+        # are any CRLF sequences; otherwise we collapse to LF.
+        dominant_crlf = step1.include?("\r\n")
+        replacement = dominant_crlf ? "\r\n" : "\n"
+        # Step 3: replace every lone `\r` (not followed by `\n`) with
+        # the dominant ending. The negative lookahead leaves real
+        # CRLF intact when CRLF is the dominant style.
+        step1.gsub(/\r(?!\n)/, replacement).force_encoding(source.encoding)
+      end
+    end
+  end
+end

data/lib/sas_linter/rules/malformed_if_condition.rb ADDED Viewed

@@ -0,0 +1,291 @@
+# frozen_string_literal: true
+require_relative "../../sas_linter"
+require "sas_lexer"
+class SasLinter
+  module Rules
+    # Validate that `if ... then` conditions form well-shaped boolean
+    # expressions. Catches authoring mistakes the lexer cheerfully
+    # accepts but that won't run, e.g.
+    #
+    #     if A1 = 1 A2 = 2 then ...    * missing `and`/`or`
+    #     if A1 = 1 and then ...       * trailing operator
+    #     if = 1 then ...               * leading operator, no left operand
+    #     if then ...                   * empty condition
+    #     A1 = 1 then ...              * missing `if`
+    #     if (a = 1 and b = 2 then ...  * unbalanced parens
+    #
+    # Strategy: at each `KW_IF`, walk forward to the matching top-level
+    # `KW_THEN` (or `;` for a subsetting `if`) running a tiny
+    # operand/operator state machine. Top-level only — anything inside
+    # parens is treated as a single sub-expression so function calls
+    # and `in (...)` lists don't trigger false positives.
+    #
+    # An orphan `KW_THEN` (one not consumed by an enclosing `if`) is
+    # reported as a likely missing `if`.
+    class MalformedIfCondition < Rule
+      rule_id :malformed_if_condition
+      description "Validate `if ... then` conditions form a well-shaped " \
+                  "boolean expression (no missing operators, operands, " \
+                  "or `if` keyword; balanced parens)."
+      severity :warning
+      TT = SasLexer::Lexer::TokenType
+      COMPARISON_OPS = [
+        TT::ASSIGN, TT::KW_EQ, TT::KW_NE, TT::NE, TT::KW_LT, TT::LT, TT::KW_LE, TT::LE,
+        TT::KW_GT, TT::GT, TT::KW_GE, TT::GE, TT::KW_IN, TT::SOUNDS_LIKE, TT::GTLT, TT::LTGT,
+        TT::KW_EQT, TT::KW_GTT, TT::KW_LTT, TT::KW_GET, TT::KW_LET, TT::KW_NET
+      ].freeze
+      LOGICAL_OPS = [TT::KW_AND, TT::KW_OR, TT::AMP, TT::PIPE, TT::PIPE2].freeze
+      ARITHMETIC_OPS = [TT::PLUS, TT::MINUS, TT::STAR, TT::FSLASH, TT::STAR2,
+                        TT::EXCL, TT::EXCL2, TT::BPIPE, TT::BPIPE2].freeze
+      BINOPS = (COMPARISON_OPS + LOGICAL_OPS + ARITHMETIC_OPS).to_set.freeze
+      # `+`/`-` are also binary; the state machine disambiguates by
+      # checking whether we currently expect an operand.
+      UNARY_PREFIXES = [TT::KW_NOT, TT::NOT, TT::MINUS, TT::PLUS].to_set.freeze
+      OPERAND_TOKENS = [
+        TT::IDENTIFIER,
+        TT::INTEGER_LITERAL, TT::FLOAT_LITERAL, TT::FLOAT_EXPONENT_LITERAL,
+        TT::STRING_LITERAL, TT::HEX_STRING_LITERAL, TT::BIT_TESTING_LITERAL,
+        TT::DATE_LITERAL, TT::DATE_TIME_LITERAL, TT::TIME_LITERAL, TT::NAME_LITERAL,
+        TT::MACRO_VAR_RESOLVE, TT::MACRO_IDENTIFIER, TT::MACRO_STRING,
+        TT::STRING_EXPR_START
+      ].to_set.freeze
+      def check(tokens, path:, all_tokens: nil, source: nil) # rubocop:disable Lint/UnusedMethodArgument
+        findings = []
+        consumed_thens = {}
+        i = 0
+        while i < tokens.length
+          tok = tokens[i]
+          if tok[:type] == TT::KW_IF
+            new_i, sub_findings = analyze_if(tokens, i, path, consumed_thens)
+            findings.concat(sub_findings)
+            i = new_i
+            next
+          end
+          if tok[:type] == TT::KW_THEN && !consumed_thens[i]
+            findings << finding(
+              line: tok[:start_line],
+              column: tok[:start_column] + 1,
+              message: "`then` without a preceding `if` condition — likely missing `if`.",
+              path: path
+            )
+          end
+          i += 1
+        end
+        findings
+      end
+      private
+      # Walk from `if` at `tokens[start]` until the matching `then`
+      # (or `;` for a subsetting `if`), validating expression shape.
+      # Returns [next_i, findings].
+      #
+      # Emits at most ONE finding per `if`: one structural defect (e.g.
+      # `iK2g in 0,1)` — missing `(` after `in`) cascades through the
+      # state machine into adjacent unbalanced-paren / orphan-then
+      # errors. After the first finding, we set `broken` and walk
+      # forward to the next top-level `;`, marking any `KW_THEN`
+      # tokens as consumed so the outer loop's orphan-then detector
+      # doesn't double-fire on this same broken statement.
+      # Mutates `consumed_thens`.
+      def analyze_if(tokens, start, path, consumed_thens)
+        findings = []
+        state = :expect_operand
+        paren_depth = 0
+        open_parens = []
+        cond_started = false
+        last_op_tok = nil
+        broken = false
+        add_finding = lambda do |tok, message|
+          findings << finding(
+            line: tok[:start_line], column: tok[:start_column] + 1,
+            message: message, path: path
+          )
+          broken = true
+        end
+        i = start + 1
+        while i < tokens.length
+          t = tokens[i]
+          type = t[:type]
+          # Recovery mode: skip ahead to the next `;`, marking any
+          # `then`s we pass over as consumed so they don't flag as
+          # orphan in the outer loop. Paren state is intentionally
+          # ignored — once we've emitted a finding we don't trust
+          # the depth counter to be meaningful.
+          if broken
+            consumed_thens[i] = true if type == TT::KW_THEN
+            return [i + 1, findings] if type == TT::SEMI
+            i += 1
+            next
+          end
+          if type == TT::KW_THEN && paren_depth.zero?
+            flag_terminal(findings, path, state, cond_started, last_op_tok, t, "then")
+            consumed_thens[i] = true
+            return [i + 1, findings]
+          end
+          if type == TT::SEMI && paren_depth.zero?
+            flag_terminal(findings, path, state, cond_started, last_op_tok, t, "subsetting `if`")
+            return [i + 1, findings]
+          end
+          # `then` / `;` inside open parens means a paren never closed.
+          # Flag at the offending `(` and drop into recovery mode.
+          if (type == TT::KW_THEN || type == TT::SEMI) && paren_depth.positive?
+            lp = open_parens.first
+            add_finding.call(lp, "unbalanced `(` in `if` condition (no matching `)` " \
+                                 "before `#{t[:text]}`).")
+            consumed_thens[i] = true if type == TT::KW_THEN
+            i += 1
+            next
+          end
+          if type == TT::LPAREN || type == TT::LBRACK
+            cond_started = true
+            paren_depth += 1
+            open_parens.push(t)
+            i += 1
+            next
+          end
+          if type == TT::RPAREN || type == TT::RBRACK
+            if paren_depth.zero?
+              add_finding.call(t, "unbalanced `#{t[:text]}` in `if` condition.")
+              i += 1
+              next
+            end
+            paren_depth -= 1
+            open_parens.pop
+            # A parenthesized sub-expression, function-call arg list,
+            # or array subscript that just closed counts as one
+            # completed operand.
+            state = :expect_operator if paren_depth.zero?
+            i += 1
+            next
+          end
+          # Inside parens we don't validate — the whole `(...)` is one
+          # atom at the top level.
+          if paren_depth.positive?
+            i += 1
+            next
+          end
+          # `,` at top level only appears inside `in (...)`, which is
+          # paren-wrapped. Treat as a no-op if it leaks through.
+          if type == TT::COMMA
+            i += 1
+            next
+          end
+          cond_started = true
+          if state == :expect_operand
+            if UNARY_PREFIXES.include?(type)
+              i += 1
+              next
+            end
+            if OPERAND_TOKENS.include?(type)
+              state = :expect_operator
+              i += 1
+              next
+            end
+            if BINOPS.include?(type)
+              msg = if last_op_tok.nil?
+                      "operator `#{t[:text]}` at start of `if` condition with no left operand."
+                    else
+                      "operator `#{t[:text]}` follows operator `#{last_op_tok[:text]}` " \
+                        "with no operand between them."
+                    end
+              add_finding.call(t, msg)
+              last_op_tok = t
+              i += 1
+              next
+            end
+            # Unknown token in operand position — treat opaquely as
+            # one operand to keep walking. Reduces false positives on
+            # SAS shapes we don't fully model (e.g. DOT for missing
+            # values, format references).
+            state = :expect_operator
+            i += 1
+            next
+          end
+          # state == :expect_operator
+          if BINOPS.include?(type)
+            last_op_tok = t
+            state = :expect_operand
+            i += 1
+            next
+          end
+          # Negated comparisons: `not eq`, `not in`, `not lt`, `^=`,
+          # `^in`, `^<`, etc. The lexer splits these into a NOT/`^`
+          # token and a comparison op; recognize the pair as one
+          # binary operator so the state machine doesn't see two
+          # operators in a row.
+          if (type == TT::KW_NOT || type == TT::NOT) && (nxt = tokens[i + 1]) &&
+             COMPARISON_OPS.include?(nxt[:type])
+            last_op_tok = nxt
+            state = :expect_operand
+            i += 2
+            next
+          end
+          if OPERAND_TOKENS.include?(type) || UNARY_PREFIXES.include?(type)
+            add_finding.call(t,
+                             "missing operator before `#{t[:text]}` in `if` condition — " \
+                             "perhaps a missing `and`/`or`?")
+            i += 1
+            next
+          end
+          i += 1
+        end
+        # Reached EOF without seeing `then` or `;`.
+        [i, findings]
+      end
+      def flag_terminal(findings, path, state, cond_started, last_op_tok, terminator, where)
+        if !cond_started
+          findings << finding(
+            line: terminator[:start_line], column: terminator[:start_column] + 1,
+            message: "`if #{where}` with empty condition.",
+            path: path
+          )
+        elsif state == :expect_operand && last_op_tok
+          findings << finding(
+            line: last_op_tok[:start_line], column: last_op_tok[:start_column] + 1,
+            message: "operator `#{last_op_tok[:text]}` has no right operand before " \
+                     "`#{terminator[:text]}`.",
+            path: path
+          )
+        end
+      end
+    end
+  end
+end

data/lib/sas_linter/rules/missing_assignment_semicolon.rb ADDED Viewed

@@ -0,0 +1,141 @@
+# frozen_string_literal: true
+require_relative "../../sas_linter"
+require "sas_lexer"
+class SasLinter
+  module Rules
+    # Flag assignment statements whose terminating `;` is missing,
+    # causing the inline `**`-style comment marker to be lexed as
+    # the SAS exponentiation operator and absorbed into the RHS
+    # expression.
+    #
+    # The motivating bug is in MDS2.0_CAP_FEEDTB_G2_V2.1_P_2012-03-15.txt:
+    #
+    #     B1 = B1     **  Comatose;        ← missing `;` before `**`
+    #     B4 = B4;    **  Daily decision-making;
+    #     K5b = K5b;  **  Tube feeding;
+    #
+    # SAS lexes the first line as `B1 := B1 ^ Comatose`, where
+    # `Comatose` is an undefined variable — the assignment silently
+    # produces a missing value at runtime instead of the identity
+    # mapping the author intended.
+    #
+    # Detection: a STAR2 (`**`) token where
+    #   * the line containing it does NOT start with `**` (which
+    #     would put us in a header / boxed-comment context, where
+    #     `**` is part of the comment-statement opener, not an
+    #     operator); AND
+    #   * the previous default-channel token is an IDENTIFIER (the
+    #     RHS variable in the assignment); AND
+    #   * the next default-channel token is an IDENTIFIER (the prose
+    #     start of what should have been an inline comment).
+    #
+    # Legitimate `X = Y ** 2` exponentiation has a numeric literal
+    # after the `**`, not an identifier, so it doesn't match.
+    class MissingAssignmentSemicolon < Rule
+      rule_id :missing_assignment_semicolon
+      description "Assignment missing terminating `;` — the inline " \
+                  "`**` comment marker was lexed as exponentiation."
+      severity :warning
+      TT = SasLexer::Lexer::TokenType
+      def self.supports_autofix?
+        true
+      end
+      def check(tokens, path:, all_tokens: nil, source: nil) # rubocop:disable Lint/UnusedMethodArgument
+        findings = []
+        lines = (source || "").split("\n", -1)
+        tokens.each_with_index do |t, i|
+          next unless t[:type] == TT::STAR2
+          # Skip header / boxed-comment lines: `** PROGRAM: ... **;`,
+          # `**  DATA STEP STARTS HERE  **;`, etc. The `**` there is
+          # part of the comment shape, not an operator.
+          line = lines[t[:start_line] - 1]
+          next if line.nil? || line.lstrip.start_with?("**")
+          prev_t = tokens[i - 1] if i.positive?
+          next_t = tokens[i + 1]
+          next unless prev_t && next_t
+          next unless prev_t[:type] == TT::IDENTIFIER && next_t[:type] == TT::IDENTIFIER
+          findings << finding(
+            line: t[:start_line],
+            column: t[:start_column] + 1,
+            message: "`**` parsed as exponentiation in `#{prev_t[:text]} ** #{next_t[:text]}` — " \
+                     "looks like a missing `;` before an inline `** ... ;` comment.",
+            path: path
+          )
+        end
+        findings
+      end
+      # Insert the missing `;` immediately after the RHS identifier on
+      # each flaggable line. By replacing the single space that
+      # already sits between the identifier and the `**`, we preserve
+      # the existing column alignment of the inline-comment block —
+      # the row goes from `B1 = B1     **  ...;` to
+      # `B1 = B1;    **  ...;`, matching the canonical SAS
+      # `VAR = VAR;   ** description;` shape.
+      def autofix(source)
+        return source if source.nil? || source.empty?
+        lexer = SasLexer::Lexer.new
+        begin
+          all_tokens = lexer.tokenize(source)
+        ensure
+          lexer.free
+        end
+        tokens = all_tokens.reject do |t|
+          t[:channel] == SasLexer::Lexer::TokenChannel::HIDDEN ||
+            t[:channel] == SasLexer::Lexer::TokenChannel::COMMENT
+        end
+        source_lines = source.split("\n", -1)
+        edits = []
+        tokens.each_with_index do |t, i|
+          next unless t[:type] == TT::STAR2
+          line = source_lines[t[:start_line] - 1]
+          next if line.nil? || line.lstrip.start_with?("**")
+          prev_t = tokens[i - 1] if i.positive?
+          next_t = tokens[i + 1]
+          next unless prev_t && next_t
+          next unless prev_t[:type] == TT::IDENTIFIER && next_t[:type] == TT::IDENTIFIER
+          edits << [t[:start_line] - 1, prev_t[:end_column]]
+        end
+        edits.each do |line_idx, col|
+          line = source_lines[line_idx]
+          replacement =
+            if col + 1 < line.length && line[col] == " " && line[col + 1] == " "
+              # Two or more spaces between IDENT and `**`: consume one
+              # for the `;` so existing column alignment of the inline
+              # `**` comment is preserved (`B1 = B1     **` becomes
+              # `B1 = B1;    **`).
+              ";#{line[(col + 1)..]}"
+            elsif col < line.length && line[col] == " "
+              # Exactly one space: keep it after the `;` so we don't
+              # produce `iA16a;**` (functional but ugly) — `; **` is
+              # the canonical neighbor shape.
+              "; #{line[(col + 1)..]}"
+            else
+              # No space at all (rare). Inject `; `.
+              "; #{line[col..]}"
+            end
+          source_lines[line_idx] = "#{line[0...col]}#{replacement}"
+        end
+        source_lines.join("\n")
+      end
+    end
+  end
+end