sas-linter 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +661 -0
- data/README.md +140 -0
- data/Rakefile +11 -0
- data/bin/sas_lint +79 -0
- data/lib/sas_linter/rules/choose_one_template.rb +61 -0
- data/lib/sas_linter/rules/commented_out_guard.rb +59 -0
- data/lib/sas_linter/rules/encoding_issues.rb +322 -0
- data/lib/sas_linter/rules/identical_if_else_branches.rb +104 -0
- data/lib/sas_linter/rules/line_endings.rb +105 -0
- data/lib/sas_linter/rules/malformed_if_condition.rb +291 -0
- data/lib/sas_linter/rules/missing_assignment_semicolon.rb +141 -0
- data/lib/sas_linter/rules/source_headers.rb +290 -0
- data/lib/sas_linter/rules/tab_expansion.rb +98 -0
- data/lib/sas_linter/rules/trailing_whitespace.rb +53 -0
- data/lib/sas_linter/rules/unreachable_inner_branch_value.rb +202 -0
- data/lib/sas_linter/rules/variable_value_out_of_known_range.rb +280 -0
- data/lib/sas_linter/version.rb +5 -0
- data/lib/sas_linter.rb +287 -0
- metadata +96 -0
data/README.md
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# sas-linter
|
|
2
|
+
|
|
3
|
+
A configurable lint engine for SAS source files. Built on the [`sas-lexer`](https://github.com/mes-amis/sas-lexer-rb) gem (a Ruby FFI binding to Misha Perlov's Rust [`sas-lexer`](https://github.com/mishamsk/sas-lexer)) and ships with twelve pluggable rules covering structural defects, cosmetic issues, and source-header conventions.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
Add to your Gemfile:
|
|
8
|
+
|
|
9
|
+
```ruby
|
|
10
|
+
gem "sas-linter"
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
Or install directly:
|
|
14
|
+
|
|
15
|
+
```sh
|
|
16
|
+
gem install sas-linter
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## CLI usage
|
|
20
|
+
|
|
21
|
+
```sh
|
|
22
|
+
# Run every rule on a single file
|
|
23
|
+
bin/sas_lint path/to/source.sas
|
|
24
|
+
|
|
25
|
+
# List all registered rules with their description and autofix capability
|
|
26
|
+
bin/sas_lint --list-rules
|
|
27
|
+
|
|
28
|
+
# Run only specific rules
|
|
29
|
+
bin/sas_lint --rules malformed_if_condition,identical_if_else_branches src/*.sas
|
|
30
|
+
|
|
31
|
+
# Use a YAML config (default: config/lint.yaml)
|
|
32
|
+
bin/sas_lint --config my-lint.yaml src/*.sas
|
|
33
|
+
|
|
34
|
+
# Lint without applying any autofixes the config requested
|
|
35
|
+
bin/sas_lint --no-autofix src/*.sas
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Exit codes: `0` clean, `1` findings, `2` invalid args.
|
|
39
|
+
|
|
40
|
+
## Library usage
|
|
41
|
+
|
|
42
|
+
```ruby
|
|
43
|
+
require "sas_linter"
|
|
44
|
+
|
|
45
|
+
linter = SasLinter.new # all registered rules
|
|
46
|
+
linter = SasLinter.new(rules: [:malformed_if_condition]) # subset by rule id
|
|
47
|
+
linter = SasLinter.from_config(YAML.load_file("lint.yaml"))
|
|
48
|
+
|
|
49
|
+
findings = linter.lint(source_string, path: "demo.sas")
|
|
50
|
+
findings.each { |f| puts f.to_s } # path:line:col: [rule_id] message
|
|
51
|
+
|
|
52
|
+
# Lint a file. If any rule has autofix enabled and changed the source,
|
|
53
|
+
# the file is rewritten in place.
|
|
54
|
+
findings = linter.lint_file("path/to/source.sas")
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Built-in rules
|
|
58
|
+
|
|
59
|
+
| rule id | description |
|
|
60
|
+
|---|---|
|
|
61
|
+
| `unreachable_inner_branch_value` | Outer `if VAR in (S) then do;` guards an inner branch whose comparison values aren't all in `S`. |
|
|
62
|
+
| `identical_if_else_branches` | `if COND then S; else S;` with identical bodies — almost always a copy-paste error. |
|
|
63
|
+
| `commented_out_guard` | SAS line-comment `* if ... then do;` pattern indicating a disabled outer validity guard. |
|
|
64
|
+
| `choose_one_template` | `** CHOOSE ONE OF THE BELOW STATEMENTS;` banner indicating a broken-by-default source. |
|
|
65
|
+
| `trailing_whitespace` | Trailing spaces/tabs at end of line. |
|
|
66
|
+
| `tab_expansion` | Tab characters that should be spaces (configurable width). |
|
|
67
|
+
| `source_headers` | Restore the `**...**;` 90-char header convention to broken sources. |
|
|
68
|
+
| `line_endings` | Mixed or non-CRLF line terminators (configurable target). |
|
|
69
|
+
| `encoding_issues` | Smart-quote / em-dash / Win-1252 byte sequences that confuse downstream tooling. |
|
|
70
|
+
| `malformed_if_condition` | Empty conditions, missing operators, orphan `then`, unbalanced parens, etc. |
|
|
71
|
+
| `missing_assignment_semicolon` | Assignment statements followed by an inline `**` comment but no terminating `;`. |
|
|
72
|
+
| `variable_value_out_of_known_range` | `if VAR = N` / `if VAR in (...)` literals fall outside the variable's documented acceptable values. Loads the catalog from one or more CSVs with configurable column names and column separator (`,`, `;`, tab). |
|
|
73
|
+
|
|
74
|
+
`bin/sas_lint --list-rules` prints the same set with autofix capability.
|
|
75
|
+
|
|
76
|
+
## Writing a custom rule
|
|
77
|
+
|
|
78
|
+
Subclass `SasLinter::Rule`, declare an id, description, and severity, then implement `#check`:
|
|
79
|
+
|
|
80
|
+
```ruby
|
|
81
|
+
class MyRule < SasLinter::Rule
|
|
82
|
+
rule_id :my_rule
|
|
83
|
+
description "Flag occurrences of FOO in DATA steps."
|
|
84
|
+
severity :warning
|
|
85
|
+
|
|
86
|
+
def check(tokens, path:, all_tokens: nil, source: nil)
|
|
87
|
+
findings = []
|
|
88
|
+
tokens.each do |t|
|
|
89
|
+
next unless t[:text] == "FOO"
|
|
90
|
+
findings << finding(line: t[:start_line], column: t[:start_column],
|
|
91
|
+
message: "FOO is forbidden", path: path)
|
|
92
|
+
end
|
|
93
|
+
findings
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Subclasses self-register on the rule registry via `rule_id` — once required, they're picked up by `SasLinter.new` (no rule list) and resolvable via `SasLinter::Rule.fetch(:my_rule)`.
|
|
99
|
+
|
|
100
|
+
To support autofix, override `self.supports_autofix?` to return `true` and implement `#autofix(source)` to return the rewritten source.
|
|
101
|
+
|
|
102
|
+
## YAML config
|
|
103
|
+
|
|
104
|
+
```yaml
|
|
105
|
+
rules:
|
|
106
|
+
malformed_if_condition:
|
|
107
|
+
enabled: true # default
|
|
108
|
+
trailing_whitespace:
|
|
109
|
+
enabled: true
|
|
110
|
+
autofix: true
|
|
111
|
+
encoding_issues:
|
|
112
|
+
enabled: true
|
|
113
|
+
use_defaults: true
|
|
114
|
+
replacements:
|
|
115
|
+
"—": "--"
|
|
116
|
+
identical_if_else_branches:
|
|
117
|
+
enabled: false # disable a rule
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Rules omitted from the config default to enabled with no options, so adding a new rule to the gem won't silently disable it for users with existing configs.
|
|
121
|
+
|
|
122
|
+
## Testing
|
|
123
|
+
|
|
124
|
+
```sh
|
|
125
|
+
bundle install
|
|
126
|
+
bundle exec rake spec
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## License
|
|
130
|
+
|
|
131
|
+
[GNU Affero General Public License v3.0 or later](LICENSE) — chosen to match the upstream `sas-lexer` gem (which `sas-linter` requires at runtime). © Mon Ami, Inc.
|
|
132
|
+
|
|
133
|
+
Practical implications:
|
|
134
|
+
|
|
135
|
+
- **Internal / personal use** has no obligations beyond preserving notices.
|
|
136
|
+
- **Redistribution** (shipping the gem inside a binary, container image, or product) requires offering the complete corresponding source under AGPL-3.0.
|
|
137
|
+
- **Network use** (running `sas-linter` as a backend that users interact with remotely) triggers the AGPL's source-disclosure clause for those network users.
|
|
138
|
+
- **Combined works** with `sas-linter` must be licensed under AGPL-compatible terms.
|
|
139
|
+
|
|
140
|
+
If those terms don't fit your use case, run a standalone lint job (CLI / CI step) instead of embedding the linter in a redistributed product.
|
data/Rakefile
ADDED
data/bin/sas_lint
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
# frozen_string_literal: true

# sas_lint — command-line front end for SasLinter.
#
# Exit status:
#   0  every file was clean
#   1  at least one file produced findings
#   2  invalid arguments: an option that could not be parsed, no FILE
#      given, or a FILE that is not a regular file
#
# Status 2 is sticky: once an argument problem has been seen, findings in
# other files never downgrade it to 1, whatever the argument order.

require "sas_linter"
require "optparse"

options = { config: SasLinter::DEFAULT_CONFIG_PATH, rules: nil, autofix: true }
parser = OptionParser.new do |opts|
  opts.banner = "Usage: sas_lint FILE [FILE ...] [options]"

  opts.on("--config PATH", "YAML config file (default: #{SasLinter::DEFAULT_CONFIG_PATH}). " \
                           "Missing file → run with built-in defaults.") do |path|
    options[:config] = path
  end

  opts.on("--rules id1,id2,...", Array,
          "Override config and run only the listed rules with default options.") do |ids|
    options[:rules] = ids.map(&:to_sym)
  end

  opts.on("--no-autofix",
          "Suppress autofix even if the config sets `autofix: true` for some rule. " \
          "Findings are still reported but no file is rewritten.") do
    options[:autofix] = false
  end

  opts.on("--list-rules", "Print every registered rule and exit") do
    SasLinter::Rule.all.each do |klass|
      mark = klass.supports_autofix? ? " [autofix]" : ""
      puts format("%-40s %s%s", klass.rule_id, klass.description, mark)
    end
    exit 0
  end

  opts.on("-h", "--help", "Show this help message") do
    puts opts
    exit 0
  end
end

# An unknown flag or a missing option argument is an "invalid args"
# condition: report it briefly and exit 2 instead of letting the
# exception escape with a backtrace.
begin
  parser.parse!
rescue OptionParser::ParseError => e
  warn "sas_lint: #{e.message}"
  warn parser.help
  exit 2
end

if ARGV.empty?
  warn parser.help
  exit 2
end

linter =
  if options[:rules]
    SasLinter.new(rules: options[:rules])
  else
    config = SasLinter.load_config_file(options[:config])
    # `--no-autofix` strips every rule's autofix flag from the loaded config
    # before the linter is built, so a dry run can never rewrite a file.
    if options[:autofix] == false && config.is_a?(Hash) && config["rules"].is_a?(Hash)
      config["rules"].each_value do |opts_hash|
        opts_hash["autofix"] = false if opts_hash.is_a?(Hash) && opts_hash["autofix"]
      end
    end
    SasLinter.from_config(config)
  end

exit_code = 0

ARGV.each do |path|
  unless File.file?(path)
    warn "sas_lint: #{path}: not a regular file"
    exit_code = 2
    next
  end

  findings = linter.lint_file(path)
  next if findings.empty?

  # Only raise the status from "clean"; never overwrite an earlier 2.
  exit_code = 1 if exit_code.zero?
  findings.each { |f| puts f.to_s }
end

exit exit_code
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "../../sas_linter"
|
|
4
|
+
require "sas_lexer"
|
|
5
|
+
|
|
6
|
+
class SasLinter
  module Rules
    # Reports every SAS statement-comment that carries the
    # "CHOOSE ONE OF THE BELOW STATEMENTS" banner.
    #
    # The banner precedes a set of alternative, mutually-exclusive validity
    # guards (e.g. `[USE FOR HC OR CHA WITH FS]`, `[USE FOR LTCF]`,
    # `[USE FOR CHA WITHOUT FS]`) that all ship commented out; whoever
    # consumes the source is expected to enable exactly one of them.
    #
    # Reasons to flag it:
    #   - Nobody can use the file as shipped; it has to be edited first.
    #   - If no variant is enabled, SAS still accepts the leftover `end;`
    #     and the algorithm simply runs without any guard.
    #   - Choosing a deployment context is a configuration concern (or a
    #     per-context source variant), not something to toggle through
    #     comments in the middle of the algorithm.
    #
    # The sibling rule `commented_out_guard` reports each disabled guard;
    # this one reports the banner itself, so multi-template files are found
    # whether or not a variant has already been switched on.
    class ChooseOneTemplate < Rule
      rule_id :choose_one_template
      description "Source ships with a 'CHOOSE ONE OF THE BELOW STATEMENTS' " \
                  "banner — broken-by-default; consumers must mutate the " \
                  "source to pick a deployment-context guard."
      severity :warning

      TT = SasLexer::Lexer::TokenType
      TC = SasLexer::Lexer::TokenChannel

      BANNER = /CHOOSE\s+ONE\s+OF\s+THE\s+BELOW\s+STATEMENTS/i

      # Text attached to every finding this rule produces.
      MESSAGE = "'CHOOSE ONE OF THE BELOW STATEMENTS' banner — source is " \
                "broken-by-default; the alternative validity guards below " \
                "are all commented out so every consumer must edit this " \
                "file. Pick one variant, delete the others, and remove " \
                "this banner."

      # Scans `all_tokens` (the stream that includes comment tokens) and
      # returns one finding per banner comment. Returns an empty list when
      # `all_tokens` is not supplied.
      def check(_tokens, path:, all_tokens: nil, source: nil) # rubocop:disable Lint/UnusedMethodArgument
        return [] unless all_tokens

        all_tokens.filter_map do |tok|
          banner_finding(tok, path) if banner_comment?(tok)
        end
      end

      private

      # True for a statement-comment token on the comment channel whose
      # text contains the banner phrase.
      def banner_comment?(tok)
        tok[:channel] == TC::COMMENT &&
          tok[:type] == TT::COMMENT_STAT &&
          BANNER.match?(tok[:text])
      end

      # Finding anchored at the start of the banner comment; the token's
      # start column is shifted by one for reporting.
      def banner_finding(tok, path)
        finding(
          line: tok[:start_line],
          column: tok[:start_column] + 1,
          message: MESSAGE,
          path: path
        )
      end
    end
  end
end
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "../../sas_linter"
|
|
4
|
+
require "sas_lexer"
|
|
5
|
+
|
|
6
|
+
class SasLinter
  module Rules
    # Reports SAS statement-comments (`* ... ;`) whose text reads like a
    # validity guard that has been switched off: an `if`, later followed by
    # `then do` (any letter case).
    #
    # The shape this targets: the outer `if ... then do;` of a source was
    # disabled with a leading `*`, while its closing `end;` is still live
    # further down. The guarded body now runs for inputs the guard used to
    # reject. Each finding deserves a human look — re-enable the guard or
    # delete the orphaned `end;`.
    class CommentedOutGuard < Rule
      rule_id :commented_out_guard
      description "SAS `* ... ;` line comment looks like a disabled `if " \
                  "... then do` validity guard — review and either restore " \
                  "the guard or remove the orphan `end;`."
      severity :warning

      TT = SasLexer::Lexer::TokenType
      TC = SasLexer::Lexer::TokenChannel

      # `if`, then anything (the `m` flag lets `.` cross line breaks, so the
      # guard expression may span several lines), then `then` separated from
      # `do` by whitespace only. Case-insensitive.
      GUARD_PATTERN = /\bif\b.*\bthen\b\s+do\b/im

      # Comment opens with a single `*`. A `**` opener is excluded because
      # `** ... **;` is the header-comment style, whereas `* ...;` is the
      # "disable this statement" style.
      SINGLE_STAR_OPENER = /\A\s*\*(?!\*)/

      # Text attached to every finding this rule produces.
      MESSAGE = "looks like a disabled validity guard (`* if ... then do; ...`); " \
                "review whether the guard should be live or whether the matching " \
                "`end;` is now orphaned."

      # Scans `all_tokens` (the stream that includes comment tokens) and
      # returns one finding per disabled-guard comment. Returns an empty
      # list when `all_tokens` is not supplied.
      def check(_tokens, path:, all_tokens: nil, source: nil) # rubocop:disable Lint/UnusedMethodArgument
        return [] unless all_tokens

        all_tokens.filter_map do |tok|
          guard_finding(tok, path) if disabled_guard?(tok)
        end
      end

      private

      # True for a single-star statement-comment on the comment channel
      # whose text matches GUARD_PATTERN.
      def disabled_guard?(tok)
        return false unless tok[:channel] == TC::COMMENT
        return false unless tok[:type] == TT::COMMENT_STAT

        text = tok[:text]
        SINGLE_STAR_OPENER.match?(text) && GUARD_PATTERN.match?(text)
      end

      # Finding anchored at the start of the comment; the token's start
      # column is shifted by one for reporting.
      def guard_finding(tok, path)
        finding(
          line: tok[:start_line],
          column: tok[:start_column] + 1,
          message: MESSAGE,
          path: path
        )
      end
    end
  end
end
|
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "../../sas_linter"
|
|
4
|
+
|
|
5
|
+
class SasLinter
|
|
6
|
+
module Rules
|
|
7
|
+
# Flag and (optionally) rewrite encoding issues in SAS source —
|
|
8
|
+
# smart quotes (`‘ ’ “ ”`), en/em dashes (`– —`), ellipsis (`…`),
|
|
9
|
+
# non-break space, U+2000–U+200A typographic spaces, line/para
|
|
10
|
+
# separators, and the Windows-1252 single-byte forms of the same
|
|
11
|
+
# characters (0x91/0x92, 0x93/0x94, 0x96/0x97, 0xA0, …) that
|
|
12
|
+
# bypass UTF-8 transcoding when source files arrive from Word /
|
|
13
|
+
# Outlook / a legacy Latin-1 round-trip.
|
|
14
|
+
#
|
|
15
|
+
# The rule has two layers, both off by default:
|
|
16
|
+
#
|
|
17
|
+
# 1. `use_defaults: true` enables the canonical fix
|
|
18
|
+
# table — UTF8_REPLACEMENTS (multibyte UTF-8 smart
|
|
19
|
+
# punctuation) plus BYTE_REPLACEMENTS (single Windows-1252
|
|
20
|
+
# bytes the lexer can't make sense of). The fixer walks the
|
|
21
|
+
# source as a byte stream and only touches a Win-1252 byte
|
|
22
|
+
# when it's not already part of a valid UTF-8 sequence — so
|
|
23
|
+
# names like `MÖLLER` (`\xC3\x96`) survive intact.
|
|
24
|
+
#
|
|
25
|
+
# 2. `replacements: { from => to }` adds project-specific
|
|
26
|
+
# string-level substitutions. Use this for site-local
|
|
27
|
+
# cleanups, stylistic preferences (em-dash → "--" instead
|
|
28
|
+
# of "-"), to override default behavior on specific bytes,
|
|
29
|
+
# or to add extra characters not in the default table.
|
|
30
|
+
#
|
|
31
|
+
# Findings carry a line:column position for every match. Autofix
|
|
32
|
+
# runs the user `replacements:` map FIRST and the canonical
|
|
33
|
+
# fixer SECOND — so a project-specific pattern can target byte
|
|
34
|
+
# sequences the canonical defaults would otherwise consume
|
|
35
|
+
# (e.g. catch a multi-byte ellipsis in a specific surname before
|
|
36
|
+
# the default `… → ...` rewrite hits it).
|
|
37
|
+
#
|
|
38
|
+
# Recognized config options:
|
|
39
|
+
# use_defaults: true | false (default: false)
|
|
40
|
+
# replacements: { String => String } (default: {})
|
|
41
|
+
# autofix: true | false (default: false)
|
|
42
|
+
class EncodingIssues < Rule
|
|
43
|
+
rule_id :encoding_issues
|
|
44
|
+
description "Source contains smart-punctuation / Win-1252 byte sequences."
|
|
45
|
+
severity :warning
|
|
46
|
+
|
|
47
|
+
# Map of UTF-8 (multi-byte) byte sequences to their ASCII
|
|
48
|
+
# replacement. Stored as raw byte strings so substitution
|
|
49
|
+
# doesn't depend on the encoding state of the source.
|
|
50
|
+
UTF8_REPLACEMENTS = {
|
|
51
|
+
"\xE2\x80\x98".b => "'", # U+2018 LEFT SINGLE QUOTATION MARK
|
|
52
|
+
"\xE2\x80\x99".b => "'", # U+2019 RIGHT SINGLE QUOTATION MARK
|
|
53
|
+
"\xE2\x80\x9A".b => "'", # U+201A SINGLE LOW-9 QUOTATION MARK
|
|
54
|
+
"\xE2\x80\x9B".b => "'", # U+201B SINGLE HIGH-REVERSED-9 QUOTATION MARK
|
|
55
|
+
"\xE2\x80\x9C".b => '"', # U+201C LEFT DOUBLE QUOTATION MARK
|
|
56
|
+
"\xE2\x80\x9D".b => '"', # U+201D RIGHT DOUBLE QUOTATION MARK
|
|
57
|
+
"\xE2\x80\x9E".b => '"', # U+201E DOUBLE LOW-9 QUOTATION MARK
|
|
58
|
+
"\xE2\x80\x93".b => "-", # U+2013 EN DASH
|
|
59
|
+
"\xE2\x80\x94".b => "-", # U+2014 EM DASH
|
|
60
|
+
"\xE2\x80\x95".b => "-", # U+2015 HORIZONTAL BAR
|
|
61
|
+
"\xE2\x80\xA6".b => "...", # U+2026 HORIZONTAL ELLIPSIS
|
|
62
|
+
"\xC2\xA0".b => " ", # U+00A0 NO-BREAK SPACE
|
|
63
|
+
# Typographic spaces in the U+2000–U+200A range plus the line/para
|
|
64
|
+
# separators. Word docs sprinkle these in liberally and SAS chokes
|
|
65
|
+
# with `ERROR 217-322: Invalid statement due to first character
|
|
66
|
+
# being unprintable`.
|
|
67
|
+
"\xE2\x80\x80".b => " ", # U+2000 EN QUAD
|
|
68
|
+
"\xE2\x80\x81".b => " ", # U+2001 EM QUAD
|
|
69
|
+
"\xE2\x80\x82".b => " ", # U+2002 EN SPACE
|
|
70
|
+
"\xE2\x80\x83".b => " ", # U+2003 EM SPACE
|
|
71
|
+
"\xE2\x80\x84".b => " ", # U+2004 THREE-PER-EM SPACE
|
|
72
|
+
"\xE2\x80\x85".b => " ", # U+2005 FOUR-PER-EM SPACE
|
|
73
|
+
"\xE2\x80\x86".b => " ", # U+2006 SIX-PER-EM SPACE
|
|
74
|
+
"\xE2\x80\x87".b => " ", # U+2007 FIGURE SPACE
|
|
75
|
+
"\xE2\x80\x88".b => " ", # U+2008 PUNCTUATION SPACE
|
|
76
|
+
"\xE2\x80\x89".b => " ", # U+2009 THIN SPACE
|
|
77
|
+
"\xE2\x80\x8A".b => " ", # U+200A HAIR SPACE
|
|
78
|
+
"\xE2\x80\xA8".b => "\n", # U+2028 LINE SEPARATOR
|
|
79
|
+
"\xE2\x80\xA9".b => "\n", # U+2029 PARAGRAPH SEPARATOR
|
|
80
|
+
# Mac Roman 0xD0–0xD5 misread as Win-1252 → Latin-1 letters.
|
|
81
|
+
# Mac OS Roman uses these byte slots for smart punctuation; when
|
|
82
|
+
# `SasLinter#read_source` interprets a Mac-Roman-authored file
|
|
83
|
+
# as Win-1252 it produces these spurious Latin-1 letters in the
|
|
84
|
+
# post-transcode UTF-8. A typical SAS source corpus (English,
|
|
85
|
+
# with documents that round-tripped through Word on Mac) shows
|
|
86
|
+
# this as Latin-1 letters Ð / Ò / Ó / Ô / Õ standing in for
|
|
87
|
+
# smart-punctuation glyphs.
|
|
88
|
+
#
|
|
89
|
+
# Skipping U+00D1 (Ñ) since it has too much legitimate Spanish-
|
|
90
|
+
# name traffic to safely auto-replace.
|
|
91
|
+
"\xC3\x90".b => "-", # U+00D0 Ð (Mac Roman: en dash)
|
|
92
|
+
"\xC3\x92".b => '"', # U+00D2 Ò (Mac Roman: left double quote)
|
|
93
|
+
"\xC3\x93".b => '"', # U+00D3 Ó (Mac Roman: right double quote)
|
|
94
|
+
"\xC3\x94".b => "'", # U+00D4 Ô (Mac Roman: left single quote)
|
|
95
|
+
"\xC3\x95".b => "'", # U+00D5 Õ (Mac Roman: right single quote)
|
|
96
|
+
}.freeze
|
|
97
|
+
|
|
98
|
+
# Map of single Windows-1252 bytes (0x80-0x9F range, plus 0xA0)
|
|
99
|
+
# to their ASCII replacement. These bytes are invalid as
|
|
100
|
+
# standalone UTF-8 but are how Word documents on legacy Windows
|
|
101
|
+
# render the same characters covered by UTF8_REPLACEMENTS above.
|
|
102
|
+
# Only applied to bytes that are *not* part of a valid UTF-8
|
|
103
|
+
# sequence in context.
|
|
104
|
+
#
|
|
105
|
+
# 0x85 (HORIZONTAL ELLIPSIS) is intentionally absent. In real-world
|
|
106
|
+
# SAS source corpora a standalone 0x85 is overwhelmingly a corrupted
|
|
107
|
+
# Latin-1 letter inside a name (e.g. `\x85` standing in for `Ö`,
|
|
108
|
+
# `Á`, etc.), not a real ellipsis. Mapping it to `...` would corrupt
|
|
109
|
+
# those names. Recovering the proper letter is data-loss recovery
|
|
110
|
+
# and belongs in a separate, source-specific fix configured via the
|
|
111
|
+
# user `replacements:` map.
|
|
112
|
+
BYTE_REPLACEMENTS = {
|
|
113
|
+
0x82 => "'", # SINGLE LOW-9 QUOTATION MARK
|
|
114
|
+
0x91 => "'", # LEFT SINGLE QUOTATION MARK
|
|
115
|
+
0x92 => "'", # RIGHT SINGLE QUOTATION MARK
|
|
116
|
+
0x93 => '"', # LEFT DOUBLE QUOTATION MARK
|
|
117
|
+
0x94 => '"', # RIGHT DOUBLE QUOTATION MARK
|
|
118
|
+
0x96 => "-", # EN DASH
|
|
119
|
+
0x97 => "-", # EM DASH
|
|
120
|
+
0xA0 => " ", # NO-BREAK SPACE
|
|
121
|
+
}.freeze
|
|
122
|
+
|
|
123
|
+
class << self
  # This rule is able to rewrite the source it flags.
  def supports_autofix?
    true
  end

  # Builds a rule instance from one rule's option hash.
  #
  # Keys may be strings or symbols. Recognized keys:
  #   "use_defaults"  - truthy enables the canonical replacement tables
  #   "replacements"  - from => to pairs; keys and values are stringified
  #   "autofix"       - truthy enables rewriting
  # Anything missing falls back to false / an empty map.
  def from_config(opts = {})
    settings = opts.transform_keys(&:to_s)
    pairs = settings["replacements"] || {}

    new(
      use_defaults: !!settings.fetch("use_defaults", false),
      replacements: pairs.map { |from, to| [from.to_s, to.to_s] }.to_h,
      autofix: !!settings["autofix"]
    )
  end
end
|
|
136
|
+
|
|
137
|
+
attr_reader :replacements, :use_defaults
|
|
138
|
+
|
|
139
|
+
# use_defaults - whether the canonical replacement tables are active
# replacements - extra from => to string substitutions
# autofix      - forwarded to the base rule's initializer
def initialize(use_defaults: false, replacements: {}, autofix: false)
  super(autofix: autofix)
  @replacements = replacements
  @use_defaults = use_defaults
end
|
|
144
|
+
|
|
145
|
+
# Collects findings from whichever layers are switched on: the canonical
# tables (when `use_defaults` is set) first, then the user `replacements`
# map (when it is non-empty). Works on the raw `source` text only; returns
# an empty list when no source is given.
def check(_tokens, path:, all_tokens: nil, source: nil) # rubocop:disable Lint/UnusedMethodArgument
  return [] unless source

  canonical = @use_defaults ? default_findings(source, path: path) : []
  custom = @replacements.empty? ? [] : replacement_findings(source, path: path)
  canonical + custom
end
|
|
153
|
+
|
|
154
|
+
# Returns `source` with the configured substitutions applied, as an
# ASCII-8BIT string.
#
# Order matters: the user `replacements:` map runs FIRST and the canonical
# fixer SECOND, so a project-specific pattern can match byte sequences the
# canonical defaults would otherwise consume. Motivating case: a corpus
# with stray `\x85` bytes inside surnames (a corrupted Latin-1 letter)
# that `SasLinter#read_source` transcodes to `\xE2\x80\xA6` (the UTF-8
# ellipsis); a `"M…LLER": "MÖLLER"` entry can only fire if it sees those
# bytes BEFORE the canonical map rewrites them to `...`.
#
# Every operand of `gsub` is converted with `.b` so mismatched encodings
# can't raise `Encoding::CompatibilityError`: `source` arrives from
# `lint_file` as UTF-8, `@replacements` keys/values from YAML are UTF-8,
# and `apply_canonical_fix` returns ASCII-8BIT. `.b` yields a
# binary-tagged copy without touching the bytes, so the substitution stays
# byte-faithful.
def autofix(source)
  customized = @replacements.inject(source.b) do |text, (from, to)|
    # An empty pattern matches between every byte, so `gsub` would splice
    # `to` throughout the whole file. Such an entry is ignored.
    next text if from.empty?

    text.gsub(from.b, to.b)
  end

  @use_defaults ? apply_canonical_fix(customized) : customized
end
|
|
176
|
+
|
|
177
|
+
# Canonical fixer: scans `source` one byte position at a time and applies
# UTF8_REPLACEMENTS to multi-byte sequences and BYTE_REPLACEMENTS to
# stray single bytes. A high byte is looked up in BYTE_REPLACEMENTS only
# when `utf8_sequence_length` reports that no UTF-8 sequence starts there,
# so `M\xC3\x96LLER` (the Ö in `MÖLLER`) is kept while a lone `\x96` turns
# into `-`. The result is an ASCII-8BIT string.
def apply_canonical_fix(source)
  bytes = source.bytes
  total = bytes.length
  fixed = String.new(encoding: Encoding::BINARY)
  pos = 0

  until pos >= total
    byte = bytes[pos]

    # Plain ASCII is copied through untouched.
    if byte < 0x80
      fixed << byte
      pos += 1
      next
    end

    width = utf8_sequence_length(bytes, pos, total)
    if width.positive?
      # A multi-byte sequence: swap it if the table knows it, else keep it.
      chunk = bytes[pos, width].pack("C*")
      fixed << (UTF8_REPLACEMENTS[chunk]&.b || chunk)
      pos += width
    else
      # A stray high byte: treat it as Windows-1252.
      fixed << (BYTE_REPLACEMENTS[byte]&.b || byte)
      pos += 1
    end
  end

  fixed
end
|
|
212
|
+
|
|
213
|
+
private
|
|
214
|
+
|
|
215
|
+
# Scans the bytes of `source` and emits one finding for each sequence
# listed in UTF8_REPLACEMENTS and each stray byte listed in
# BYTE_REPLACEMENTS. Positions are tracked as line / column, both starting
# at 1; a line feed starts a new line, and every other ASCII byte, UTF-8
# sequence or stray high byte advances the column by one.
def default_findings(source, path:)
  bytes = source.b.bytes
  total = bytes.length
  results = []
  row = 1
  column = 1
  pos = 0

  while pos < total
    byte = bytes[pos]

    if byte < 0x80
      if byte == 0x0A
        row += 1
        column = 1
      else
        column += 1
      end
      pos += 1
      next
    end

    width = utf8_sequence_length(bytes, pos, total)
    if width.positive?
      chunk = bytes[pos, width].pack("C*")
      substitute = UTF8_REPLACEMENTS[chunk]
      label = substitute && "UTF-8 #{format('U+%04X', codepoint(chunk))}"
    else
      # No UTF-8 sequence starts here: consume a single byte.
      width = 1
      substitute = BYTE_REPLACEMENTS[byte]
      label = substitute && "Windows-1252 0x#{format('%02X', byte)}"
    end

    if substitute
      results << finding(
        line: row, column: column,
        message: "#{label} -> #{substitute.inspect}#{autofix? ? ' (autofixed)' : ''}",
        path: path
      )
    end

    column += 1
    pos += width
  end

  results
end
|
|
268
|
+
|
|
269
|
+
# Emits one finding per occurrence of each user `replacements:` key,
# searching the source line by line (the trailing `\n` / `\r\n` is not
# searched). Lines and columns are reported starting at 1; the column is
# the string index of the match plus one. Overlapping occurrences of the
# same key are not reported twice: the search resumes after each match.
def replacement_findings(source, path:)
  findings = []
  source.each_line.with_index(1) do |line, line_no|
    chomped = line.sub(/\r?\n\z/, "")
    @replacements.each do |from, to|
      # An empty key matches at every offset without ever advancing the
      # search position, which would loop forever; such entries are skipped.
      next if from.empty?

      search_from = 0
      while (idx = chomped.index(from, search_from))
        findings << finding(
          line: line_no,
          column: idx + 1,
          message: "found #{from.inspect}#{autofix? ? " → #{to.inspect} (autofixed)" : ' (no autofix)'}",
          path: path
        )
        search_from = idx + from.length
      end
    end
  end
  findings
end
|
|
288
|
+
|
|
289
|
+
# Returns the Unicode code point of the first character encoded by the raw
# UTF-8 bytes in `seq`.
#
# `seq` arrives tagged ASCII-8BIT (it is built with `pack("C*")`), so the
# bytes are re-labelled as UTF-8 rather than transcoded: transcoding from
# binary treats every high byte as undefined and would yield U+FFFD for
# each one. `dup` keeps the caller's (possibly frozen) string untouched;
# `scrub` makes malformed input report U+FFFD instead of raising.
def codepoint(seq)
  seq.dup.force_encoding(Encoding::UTF_8).scrub.codepoints.first
end
|
|
292
|
+
|
|
293
|
+
# Returns the length (2-4) of the well-formed UTF-8 sequence starting at
# `bytes[i]`, or 0 when no well-formed multi-byte sequence starts there.
# ASCII (the 1-byte case) is handled by the callers before this is
# invoked, so an ASCII byte also yields 0 here.
#
# bytes - Array of Integer byte values
# i     - index of the candidate lead byte
# n     - number of usable bytes in `bytes`
#
# The byte right after the lead has a lead-specific range. This rejects
# overlong forms (E0 80..9F, F0 80..8F), UTF-16 surrogates (ED A0..BF) and
# values above U+10FFFF (F4 90..BF). Without these limits a Windows-1252
# run such as `\xE0\x93\x94` (à followed by curly quotes) would pass for
# one 3-byte character and its quote bytes would be left unreplaced.
def utf8_sequence_length(bytes, i, n)
  trailing, second_byte =
    case bytes[i]
    when 0xC2..0xDF then [1, 0x80..0xBF]
    when 0xE0       then [2, 0xA0..0xBF]
    when 0xED       then [2, 0x80..0x9F]
    when 0xE1..0xEF then [2, 0x80..0xBF]
    when 0xF0       then [3, 0x90..0xBF]
    when 0xF1..0xF3 then [3, 0x80..0xBF]
    when 0xF4       then [3, 0x80..0x8F]
    else return 0 # continuation byte, C0/C1, F5..FF, or ASCII
    end

  return 0 unless i + 1 < n && second_byte.cover?(bytes[i + 1])
  # Any remaining trailing bytes only need to be generic continuations.
  return 0 unless (2..trailing).all? { |k| valid_continuation?(bytes, i + k, n) }

  trailing + 1
end

# True when index `i` is in range and `bytes[i]` is a UTF-8 continuation
# byte (0x80..0xBF).
def valid_continuation?(bytes, i, n)
  return false if i >= n

  b = bytes[i]
  b >= 0x80 && b < 0xC0
end
|
|
320
|
+
end
|
|
321
|
+
end
|
|
322
|
+
end
|