sas-linter 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +6 -1
- data/lib/sas_linter/rules/inconsistent_variable_case.rb +161 -0
- data/lib/sas_linter/version.rb +1 -1
- data/lib/sas_linter.rb +1 -0
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 144b602b9b4eff14c301d9c04852f07f490e9557802f897eb4dd08acbf3d3fa4
|
|
4
|
+
data.tar.gz: 4aa7e953e1ed8a05cd1f4b94cf698086438a529604019120be0bcc0bb0530be3
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 74e7303ebdcfcfc616cd6e520f8984494d687b3c0c3af6dcba4efba91f7cbfefd61f511bfa535ae4ee9c4f6a481d4774a96b7a4ff20803ce2768782f2b39f5e6
|
|
7
|
+
data.tar.gz: 7a1dab42267ac076c9d992d7770871c386d52d6f4c6c2cc088372151b6f586e1ff12b963dcab26a47aa0b427396eef989c9f837805a4183e35c43a734b8fbd41
|
data/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# sas-linter
|
|
2
2
|
|
|
3
|
-
A configurable lint engine for SAS source files. Built on the [`sas-lexer`](https://github.com/mes-amis/sas-lexer-rb) gem (a Ruby FFI binding to Misha Perlov's Rust [`sas-lexer`](https://github.com/mishamsk/sas-lexer)) and ships with
|
|
3
|
+
A configurable lint engine for SAS source files. Built on the [`sas-lexer`](https://github.com/mes-amis/sas-lexer-rb) gem (a Ruby FFI binding to Misha Perlov's Rust [`sas-lexer`](https://github.com/mishamsk/sas-lexer)) and ships with thirteen pluggable rules covering structural defects, cosmetic issues, and source-header conventions.
|
|
4
4
|
|
|
5
5
|
## Installation
|
|
6
6
|
|
|
@@ -63,6 +63,10 @@ rules:
|
|
|
63
63
|
enabled: true
|
|
64
64
|
autofix: false # rule supports autofix; off by default
|
|
65
65
|
|
|
66
|
+
inconsistent_variable_case:
|
|
67
|
+
enabled: true
|
|
68
|
+
autofix: false # rewrite every minority casing to the most-common form
|
|
69
|
+
|
|
66
70
|
variable_value_out_of_known_range:
|
|
67
71
|
enabled: true
|
|
68
72
|
csv_paths: # empty list = rule is a no-op
|
|
@@ -135,6 +139,7 @@ findings = linter.lint_file("path/to/source.sas")
|
|
|
135
139
|
| `malformed_if_condition` | Empty conditions, missing operators, orphan `then`, unbalanced parens, etc. |
|
|
136
140
|
| `missing_assignment_semicolon` | Assignment statements followed by an inline `**` comment but no terminating `;`. |
|
|
137
141
|
| `variable_value_out_of_known_range` | `if VAR = N` / `if VAR in (...)` literals fall outside the variable's documented acceptable values. Loads the catalog from one or more CSVs with configurable column names and column separator (`,`, `;`, tab). |
|
|
142
|
+
| `inconsistent_variable_case` | Identifier appears with more than one casing in the same file (`myVar` vs `MyVar`). SAS treats both as the same variable; autofix rewrites every minority spelling to the most-common form. Skips proc-format definitions and `format.` / `lib.member` references. |
|
|
138
143
|
|
|
139
144
|
`bin/sas_lint --list-rules` prints the same set with autofix capability.
|
|
140
145
|
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "../../sas_linter"
|
|
4
|
+
require "sas_lexer"
|
|
5
|
+
|
|
6
|
+
class SasLinter
|
|
7
|
+
module Rules
|
|
8
|
+
# Flag identifiers that are spelled with inconsistent letter case
|
|
9
|
+
# across the file. SAS resolves variable references case-insensitively,
|
|
10
|
+
# so `myVar` and `MyVar` end up bound to the same column — but mixing
|
|
11
|
+
# the two within one program is sloppy and makes the source harder to
|
|
12
|
+
# grep, diff, and read.
|
|
13
|
+
#
|
|
14
|
+
# The most-used spelling wins; every other casing is reported (and
|
|
15
|
+
# rewritten when autofix is on). Ties resolve to the first occurrence
|
|
16
|
+
# so the canonical form is reading-order deterministic.
|
|
17
|
+
#
|
|
18
|
+
# Skipped on purpose:
|
|
19
|
+
# * identifiers immediately followed by `.` (format references like
|
|
20
|
+
# `agecat.`, library references like `work.foo`);
|
|
21
|
+
# * identifiers immediately preceded by `.` (the column half of
|
|
22
|
+
# `lib.member` / `dataset.col`) — those name a column in another
|
|
23
|
+
# dataset, not a variable in the current step;
|
|
24
|
+
# * `value` / `invalue` / `picture` themselves and the format name
|
|
25
|
+
# directly following them — these are proc-format definitions,
|
|
26
|
+
# not variable references. We match locally rather than tracking
|
|
27
|
+
# a `proc format ... run;` block because real-world SAS files
|
|
28
|
+
# meant to be `%include`d into a caller's data step often omit
|
|
29
|
+
# the terminating `run;`, so a state machine would never close.
|
|
30
|
+
class InconsistentVariableCase < Rule
|
|
31
|
+
rule_id :inconsistent_variable_case
|
|
32
|
+
description "Variable identifiers must use one consistent letter case " \
|
|
33
|
+
"across the file; mixing `myVar` and `MyVar` is sloppy."
|
|
34
|
+
severity :warning
|
|
35
|
+
|
|
36
|
+
TT = SasLexer::Lexer::TokenType
|
|
37
|
+
|
|
38
|
+
# Identifiers that introduce a format / informat / picture
|
|
39
|
+
# definition in a `proc format` step. The lexer types these as
|
|
40
|
+
# plain IDENTIFIERs (not keywords), so we recognize them by text.
|
|
41
|
+
FORMAT_DEF_KEYWORDS = %w[value invalue picture].freeze
|
|
42
|
+
|
|
43
|
+
# Always returns true: this rule provides an `autofix` implementation.
def self.supports_autofix?
|
|
44
|
+
true
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
# Report every identifier occurrence whose spelling differs from the
# canonical casing picked for its (case-insensitive) name.
#
# tokens     - token hashes to inspect; passed straight to
#              `each_inconsistent_use`.
# path       - forwarded unchanged to every finding.
# all_tokens - accepted for signature compatibility; not used here.
# source     - accepted for signature compatibility; not used here.
#
# Returns an Array holding one `finding(...)` result per offending
# occurrence (empty when every identifier is spelled consistently).
def check(tokens, path:, all_tokens: nil, source: nil) # rubocop:disable Lint/UnusedMethodArgument
  [].tap do |reported|
    each_inconsistent_use(tokens) do |offender, preferred|
      text = "variable `#{offender[:text]}` is spelled `#{preferred}` " \
             "elsewhere in this file — pick one case and stick with it."

      reported << finding(
        line: offender[:start_line],
        # NOTE(review): presumably converts a zero-based lexer column to a
        # one-based column for display — confirm against the lexer.
        column: offender[:start_column] + 1,
        message: text,
        path: path
      )
    end
  end
end
|
|
60
|
+
|
|
61
|
+
# Rewrite every minority-cased identifier in `source` to the canonical
# spelling chosen for its name.
#
# Returns `source` itself when it is nil or empty. Otherwise returns a new
# UTF-8-tagged String with all replacements applied (a copy is returned
# even when nothing needed rewriting).
def autofix(source)
  return source if source.nil? || source.empty?

  # Any input not already labelled UTF-8 is relabelled on a copy before
  # slicing. Rationale carried over from the original author's note: an
  # earlier rule's autofix (e.g. EncodingIssues#autofix) may hand back an
  # ASCII-8BIT string, while the lexer reads the bytes as UTF-8 and reports
  # character offsets. `String#[]=` indexes by bytes on a binary-tagged
  # string but by characters on a UTF-8 one, so a binary label plus any
  # earlier multi-byte sequence would shift every replacement.
  src = source
  src = source.dup.force_encoding("UTF-8") unless source.encoding == Encoding::UTF_8

  lexer = SasLexer::Lexer.new
  begin
    all_tokens = lexer.tokenize(src)
  ensure
    # Release the lexer whether or not tokenizing raised.
    lexer.free
  end

  # Keep only tokens that are on neither the HIDDEN nor the COMMENT channel.
  tokens = all_tokens.select do |tok|
    tok[:channel] != SasLexer::Lexer::TokenChannel::HIDDEN &&
      tok[:channel] != SasLexer::Lexer::TokenChannel::COMMENT
  end

  patches = []
  each_inconsistent_use(tokens) do |tok, preferred|
    patches << { from: tok[:start], to: tok[:end], text: preferred }
  end

  # Apply from the highest start offset downwards so that offsets of the
  # patches still to be applied remain valid.
  result = src.dup
  patches.sort_by { |patch| -patch[:from] }.each do |patch|
    result[patch[:from]...patch[:to]] = patch[:text]
  end
  result
end
|
|
96
|
+
|
|
97
|
+
private
|
|
98
|
+
|
|
99
|
+
# Yields `token, canonical_form` once for every collected identifier
# occurrence whose text is not the canonical spelling of its group.
#
# Groups come from `collect_variable_uses`; a group with a single distinct
# spelling is skipped entirely. Returns the groups hash (callers in this
# class ignore the return value).
def each_inconsistent_use(tokens)
  collect_variable_uses(tokens).each_value do |occurrences|
    # spelling => number of times that exact spelling appears
    spellings = occurrences.map { |occ| occ[:text] }.tally
    next unless spellings.size > 1

    preferred = canonical_form(spellings, occurrences)
    occurrences.each do |occ|
      next if occ[:text] == preferred

      yield occ, preferred
    end
  end
end
|
|
114
|
+
|
|
115
|
+
# Bucket eligible IDENTIFIER tokens by their lowercased text.
#
# A token is eligible when its type is `TT::IDENTIFIER` and
# `variable_use?` accepts it at that position. Returns a Hash mapping the
# lowercased name to the Array of matching tokens, in the order they were
# encountered; the Hash auto-creates an empty Array for unknown keys.
def collect_variable_uses(tokens)
  empty_buckets = Hash.new { |hash, key| hash[key] = [] }

  tokens.each_with_index.each_with_object(empty_buckets) do |(tok, idx), buckets|
    next if tok[:type] != TT::IDENTIFIER
    next unless variable_use?(tokens, idx)

    buckets[tok[:text].downcase].push(tok)
  end
end
|
|
127
|
+
|
|
128
|
+
# Decide whether the identifier at `tokens[i]` counts as a variable use.
#
# Returns false when any of these hold (checked in this order):
#   1. the next token is a DOT that starts exactly where this token ends
#      (`agecat.`, `work.foo`);
#   2. the previous token is a DOT that ends exactly where this token
#      starts (the member half of `lib.member`);
#   3. the token's own text is one of FORMAT_DEF_KEYWORDS
#      (case-insensitive);
#   4. the previous token is an IDENTIFIER whose text is one of
#      FORMAT_DEF_KEYWORDS (the name being defined after `value` etc.).
# Returns true otherwise.
#
# The dot is a separate token, so offset equality is what tells a
# truly-adjacent dot from one that merely follows after whitespace.
def variable_use?(tokens, i)
  current = tokens[i]
  following = tokens[i + 1]
  # Only look backwards when there is a previous token; a negative index
  # would otherwise wrap around to the end of the list.
  preceding = tokens[i - 1] if i.positive?

  if following && following[:type] == TT::DOT && following[:start] == current[:end]
    false
  elsif preceding && preceding[:type] == TT::DOT && preceding[:end] == current[:start]
    false
  elsif FORMAT_DEF_KEYWORDS.include?(current[:text].downcase)
    false
  elsif preceding && preceding[:type] == TT::IDENTIFIER &&
        FORMAT_DEF_KEYWORDS.include?(preceding[:text].downcase)
    false
  else
    true
  end
end
|
|
147
|
+
|
|
148
|
+
# Pick the canonical spelling for one group of identifier occurrences.
#
# forms - Hash of exact spelling => occurrence count.
# uses  - Array of token hashes (each with a :text key) in reading order.
#
# The spelling with the highest count wins. When several spellings share
# the highest count, the one that appears first in `uses` wins, so the
# choice follows reading order and is stable across runs. If no entry in
# `uses` matches a top spelling, the first top spelling in `forms` order
# is returned. Returns nil when `forms` is empty.
def canonical_form(forms, uses)
  top_count = forms.values.max
  leaders = forms.select { |_, count| count == top_count }.keys
  # Zero or one leader: nothing to break a tie on.
  return leaders.first if leaders.size <= 1

  first_leader_use = uses.find { |use| leaders.include?(use[:text]) }
  first_leader_use ? first_leader_use[:text] : leaders.first
end
|
|
159
|
+
end
|
|
160
|
+
end
|
|
161
|
+
end
|
data/lib/sas_linter/version.rb
CHANGED
data/lib/sas_linter.rb
CHANGED
|
@@ -309,3 +309,4 @@ require_relative "sas_linter/rules/malformed_if_condition"
|
|
|
309
309
|
require_relative "sas_linter/rules/missing_assignment_semicolon"
|
|
310
310
|
require_relative "sas_linter/rules/variable_value_out_of_known_range"
|
|
311
311
|
require_relative "sas_linter/rules/invalid_numeric_literal"
|
|
312
|
+
require_relative "sas_linter/rules/inconsistent_variable_case"
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: sas-linter
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.1
|
|
4
|
+
version: 0.2.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Craig McNamara
|
|
@@ -64,6 +64,7 @@ files:
|
|
|
64
64
|
- lib/sas_linter/rules/commented_out_guard.rb
|
|
65
65
|
- lib/sas_linter/rules/encoding_issues.rb
|
|
66
66
|
- lib/sas_linter/rules/identical_if_else_branches.rb
|
|
67
|
+
- lib/sas_linter/rules/inconsistent_variable_case.rb
|
|
67
68
|
- lib/sas_linter/rules/invalid_numeric_literal.rb
|
|
68
69
|
- lib/sas_linter/rules/line_endings.rb
|
|
69
70
|
- lib/sas_linter/rules/malformed_if_condition.rb
|