pgsqlarbiter 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/pgsqlarbiter/analysis.rb +14 -0
- data/lib/pgsqlarbiter/analyzer.rb +504 -0
- data/lib/pgsqlarbiter/arbiter.rb +84 -0
- data/lib/pgsqlarbiter/default_query_functions.rb +177 -0
- data/lib/pgsqlarbiter/error.rb +18 -0
- data/lib/pgsqlarbiter/keywords.rb +66 -0
- data/lib/pgsqlarbiter/lexer.rb +272 -0
- data/lib/pgsqlarbiter/token.rb +34 -0
- data/lib/pgsqlarbiter/verdict.rb +37 -0
- data/lib/pgsqlarbiter/version.rb +5 -0
- data/lib/pgsqlarbiter.rb +70 -0
- metadata +80 -0
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "set"

module Pgsqlarbiter
  # Default whitelist of PostgreSQL functions permitted in queries. It deliberately
  # includes functions that can be expensive to evaluate (e.g. generate_series) —
  # resource limits are expected to be enforced elsewhere.
  #
  # Excluded: functions not used by regular queries such as pg_sleep, set_config,
  # lo_*, pg_advisory_lock, pg_notify, sequence functions, and system information
  # functions.
  DEFAULT_QUERY_FUNCTIONS = Set.new(
    # Aggregate functions
    %w[
      array_agg avg bit_and bit_or bit_xor
      bool_and bool_or count every
      json_agg jsonb_agg json_object_agg jsonb_object_agg
      max min range_agg range_intersect_agg
      string_agg sum xmlagg
    ] +
    # Statistical aggregate functions
    %w[
      corr covar_pop covar_samp
      regr_avgx regr_avgy regr_count regr_intercept
      regr_r2 regr_slope regr_sxx regr_sxy regr_syy
      stddev stddev_pop stddev_samp
      variance var_pop var_samp
    ] +
    # Ordered-set aggregate functions
    %w[mode percentile_cont percentile_disc] +
    # Window functions
    %w[
      row_number rank dense_rank percent_rank cume_dist
      ntile lag lead first_value last_value nth_value
    ] +
    # Mathematical functions
    %w[
      abs cbrt ceil ceiling degrees div
      exp factorial floor gcd lcm
      ln log log10 min_scale mod
      pi power radians random
      round scale sign sqrt
      trim_scale trunc width_bucket
    ] +
    # Trigonometric functions
    %w[
      acos acosd asin asind
      atan atan2 atan2d atand
      cos cosd cot cotd
      sin sind tan tand
    ] +
    # Hyperbolic functions
    %w[sinh cosh tanh asinh acosh atanh] +
    # String functions
    %w[
      ascii btrim char_length character_length
      chr concat concat_ws
      convert convert_from convert_to
      decode encode format
      initcap left length lower
      lpad ltrim md5
      normalize octet_length overlay
      parse_ident position
      quote_ident quote_literal quote_nullable
      regexp_count regexp_instr regexp_like
      regexp_match regexp_matches regexp_replace
      regexp_split_to_array regexp_split_to_table regexp_substr
      repeat replace reverse right
      rpad rtrim split_part
      starts_with string_to_array string_to_table
      strpos substr substring
      to_ascii to_hex translate trim
      unicode unistr upper
    ] +
    # Binary string functions
    %w[
      bit_length get_bit get_byte
      set_bit set_byte
      sha224 sha256 sha384 sha512
    ] +
    # Date/time functions
    %w[
      age clock_timestamp date_bin
      date_part date_trunc extract
      isfinite justify_days justify_hours justify_interval
      make_date make_interval make_time
      make_timestamp make_timestamptz
      now statement_timestamp
      timeofday transaction_timestamp
    ] +
    # Formatting functions
    %w[to_char to_date to_number to_timestamp] +
    # Conditional functions
    %w[coalesce nullif greatest least] +
    # Comparison functions
    %w[num_nulls num_nonnulls] +
    # JSON/JSONB functions
    %w[
      to_json to_jsonb array_to_json row_to_json
      json_build_array jsonb_build_array
      json_build_object jsonb_build_object
      json_object jsonb_object
      json_array jsonb_array
      json_array_length jsonb_array_length
      json_each jsonb_each
      json_each_text jsonb_each_text
      json_extract_path jsonb_extract_path
      json_extract_path_text jsonb_extract_path_text
      json_object_keys jsonb_object_keys
      json_populate_record jsonb_populate_record
      json_populate_recordset jsonb_populate_recordset
      json_to_record jsonb_to_record
      json_to_recordset jsonb_to_recordset
      json_strip_nulls jsonb_strip_nulls
      jsonb_set jsonb_set_lax jsonb_insert
      jsonb_path_exists jsonb_path_match
      jsonb_path_query jsonb_path_query_array jsonb_path_query_first
      jsonb_path_exists_tz jsonb_path_match_tz
      jsonb_path_query_tz jsonb_path_query_array_tz jsonb_path_query_first_tz
      jsonb_pretty
      json_typeof jsonb_typeof
      json_array_elements jsonb_array_elements
      json_array_elements_text jsonb_array_elements_text
      json_scalar jsonb_scalar
      json_table
    ] +
    # Array functions
    %w[
      array_append array_cat array_dims array_fill
      array_length array_lower array_ndims
      array_position array_positions
      array_prepend array_remove array_replace
      array_sample array_shuffle
      array_to_string array_upper
      cardinality trim_array unnest
    ] +
    # Range/multirange functions
    %w[
      isempty lower_inc upper_inc lower_inf upper_inf
      range_merge multirange
      int4range int8range numrange
      tsrange tstzrange daterange
      int4multirange int8multirange nummultirange
      tsmultirange tstzmultirange datemultirange
    ] +
    # Set-returning functions
    %w[generate_series generate_subscripts] +
    # Geometric functions
    %w[
      area center diagonal diameter height
      isclosed isopen npoints
      pclose popen radius slope width
      box circle line lseg path point polygon
    ] +
    # Network address functions
    %w[
      abbrev broadcast family
      host hostmask inet_merge inet_same_family
      masklen netmask network set_masklen
    ] +
    # Text search functions
    %w[
      array_to_tsvector numnode
      plainto_tsquery phraseto_tsquery
      querytree setweight strip
      to_tsquery to_tsvector
      ts_delete ts_filter ts_headline ts_lexize
      ts_rank ts_rank_cd ts_rewrite
      tsvector_to_array websearch_to_tsquery
    ] +
    # XML functions
    %w[
      xmlcomment xmlconcat xmlexists
      xmlelement xmlforest xmlparse xmlroot xmlserialize
      xmltable
      xpath xpath_exists
    ] +
    # Grouping function
    %w[grouping] +
    # Enum functions
    %w[enum_first enum_last enum_range] +
    # UUID functions
    %w[gen_random_uuid uuidv4 uuidv7]
  ).freeze
end
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Pgsqlarbiter
  # Root of the gem's exception hierarchy. Rescue this to catch every error
  # pgsqlarbiter can raise.
  class Error < StandardError
  end

  # Raised by the lexer when it encounters invalid syntax, such as an
  # unterminated string/comment or an unexpected character.
  class LexError < Error
  end

  # Raised by the analyzer when the SQL structure is invalid or unparseable.
  class ParseError < Error
  end

  # Raised when the input contains more than one SQL statement.
  class MultipleStatementsError < Error
  end

  # Raised when the statement type is not a supported DML type
  # (e.g. DDL, DCL, or TCL).
  class DisallowedStatementError < Error
  end
end
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "set"

module Pgsqlarbiter
  # Keyword tables shared by the lexer and analyzer. All entries are uppercase,
  # matching the lexer's normalization of keyword tokens.
  module Keywords
    # DML statement types the arbiter may permit.
    ALLOWED_STATEMENT_TYPES =
      Set.new(%w[SELECT INSERT UPDATE DELETE MERGE VALUES WITH]).freeze

    # Keywords that take parentheses but are NOT function calls.
    NON_FUNCTION_KEYWORDS = Set.new(
      %w[EXISTS CASE CAST IN NOT ANY ALL SOME ARRAY ROW VALUES LATERAL TABLE]
    ).freeze

    # Reserved words that nevertheless behave as function calls.
    FUNCTION_KEYWORDS = Set.new(
      %w[
        COALESCE NULLIF GREATEST LEAST EXTRACT
        TRIM SUBSTRING OVERLAY POSITION NORMALIZE GROUPING
        XMLELEMENT XMLFOREST XMLPARSE XMLROOT XMLSERIALIZE
      ]
    ).freeze

    # Complete set of keywords the lexer recognizes.
    ALL = Set.new(
      # Statement types
      %w[SELECT INSERT UPDATE DELETE MERGE VALUES WITH] +
      # Disallowed statement types (rejected by the analyzer)
      %w[
        CREATE DROP ALTER TRUNCATE GRANT REVOKE
        SHOW SET RESET
        BEGIN START COMMIT ROLLBACK SAVEPOINT RELEASE
        PREPARE EXECUTE DEALLOCATE
        LISTEN NOTIFY UNLISTEN
        LOAD COPY VACUUM ANALYZE CLUSTER REINDEX
        LOCK DISCARD COMMENT SECURITY REASSIGN REFRESH
        IMPORT CALL DO EXPLAIN
      ] +
      # Structural keywords
      %w[
        FROM JOIN INNER LEFT RIGHT FULL CROSS
        NATURAL OUTER ON USING INTO AS
        WHERE GROUP HAVING ORDER LIMIT OFFSET FETCH
        UNION INTERSECT EXCEPT
        ALL DISTINCT LATERAL ONLY TABLE
        RETURNING RECURSIVE COLUMNS
        NOT MATERIALIZED
        MATCHED WHEN THEN BY CONFLICT
        AND OR IS IN BETWEEN
        LIKE ILIKE SIMILAR
        CASE CAST END ELSE
        EXISTS ANY SOME ARRAY ROW
        WINDOW OVER PARTITION WITHIN FILTER
        NOTHING
        FOR IF TRUE FALSE NULL
        ASC DESC NULLS FIRST LAST
      ] +
      # Function keywords
      %w[
        COALESCE NULLIF GREATEST LEAST EXTRACT
        TRIM SUBSTRING OVERLAY POSITION NORMALIZE GROUPING
        XMLELEMENT XMLFOREST XMLPARSE XMLROOT XMLSERIALIZE
      ] +
      # Type keywords (common)
      %w[
        INT INTEGER BIGINT SMALLINT REAL FLOAT
        DOUBLE PRECISION NUMERIC DECIMAL
        CHAR CHARACTER VARCHAR TEXT
        BOOLEAN DATE TIME TIMESTAMP INTERVAL
        BOTH LEADING TRAILING
      ]
    ).freeze
  end
end
|
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "strscan"

module Pgsqlarbiter
  # SQL lexer that converts a query string into an array of {Token} objects.
  #
  # Handles all PostgreSQL token types including keywords, identifiers (plain and
  # double-quoted), strings (single-quoted, dollar-quoted, and prefixed), numbers,
  # parameters, operators, and punctuation.
  #
  # NOTE: the branch order inside {#scan_token} is load-bearing — e.g. dollar-quoted
  # strings must be checked before $n parameters, and comments before the operator
  # class (which contains "-" and "/"). Do not reorder branches casually.
  class Lexer
    include TokenType

    # Tokenize a SQL query string.
    #
    # @param sql [String] the SQL string to tokenize
    # @return [Array<Token>] list of tokens ending with an EOF token
    # @raise [LexError] on invalid syntax such as unexpected characters or unterminated
    #   strings/comments
    def tokenize(sql)
      @scanner = StringScanner.new(sql)
      @tokens = []

      until @scanner.eos?
        scan_token
      end

      # EOF sentinel carries the end-of-input offset; its value is nil.
      @tokens << Token.new(type: EOF, value: nil, position: @scanner.pos)
      @tokens
    end

    private

    # Consume exactly one token (or skippable span) from the scanner's current
    # position, appending to @tokens as a side effect. Raises LexError when no
    # branch matches.
    def scan_token
      pos = @scanner.pos

      # 1. Whitespace — skip
      return if @scanner.skip(/\s+/)

      # 2. Line comment — skip (runs to end of line; matches PostgreSQL "--")
      return if @scanner.skip(/--[^\n]*/)

      # 3. Block comment — skip (handle nesting)
      if @scanner.scan(/\/\*/)
        scan_block_comment(pos)
        return
      end

      # 4. Dollar-quoted string. Must precede the $n parameter branch; a bare
      #    "$1" cannot match here because the optional tag must start with a
      #    letter or underscore, so there is no ambiguity.
      if @scanner.check(/\$([\p{L}_][\p{L}\d_]*)?\$/)
        scan_dollar_string(pos)
        return
      end

      # 5. Parameter placeholder ($1, $2, ...)
      if (m = @scanner.scan(/\$\d+/))
        @tokens << Token.new(type: PARAM, value: m, position: pos)
        return
      end

      # 6. Single-quoted strings (including E'', B'', X'', N'' prefixes)
      # Prefix detection is handled in the identifier branch (step 12), because
      # the prefix letter itself lexes as an identifier character.
      if @scanner.check(/'/)
        scan_single_quoted_string(pos, prefix: nil)
        return
      end

      # 7. Double-quoted identifier
      if @scanner.check(/"/)
        scan_quoted_identifier(pos, unicode: false)
        return
      end

      # 8. Numbers. Radix-prefixed forms first (0x/0o/0b), then decimal forms.
      #    Underscores are accepted as digit separators throughout.
      if (m = @scanner.scan(/0[xX][0-9a-fA-F_]+/))
        @tokens << Token.new(type: NUMBER, value: m, position: pos)
        return
      end
      if (m = @scanner.scan(/0[oO][0-7_]+/))
        @tokens << Token.new(type: NUMBER, value: m, position: pos)
        return
      end
      if (m = @scanner.scan(/0[bB][01_]+/))
        @tokens << Token.new(type: NUMBER, value: m, position: pos)
        return
      end
      # digits '.' digits [exponent]
      if (m = @scanner.scan(/\d[\d_]*\.[\d_]+(?:[eE][+-]?\d[\d_]*)?/))
        @tokens << Token.new(type: NUMBER, value: m, position: pos)
        return
      end
      # digits exponent (no decimal point)
      if (m = @scanner.scan(/\d[\d_]*[eE][+-]?\d[\d_]*/))
        @tokens << Token.new(type: NUMBER, value: m, position: pos)
        return
      end
      if (m = @scanner.scan(/\d[\d_]*/))
        # Check this isn't followed by dot+digits (which would be a decimal).
        # Requiring a digit after the dot keeps range syntax like "1..10" intact.
        if @scanner.check(/\.[\d_]/)
          m += @scanner.scan(/\.[\d_]+(?:[eE][+-]?\d[\d_]*)?/)
        end
        @tokens << Token.new(type: NUMBER, value: m, position: pos)
        return
      end
      # '.' digits — leading-dot decimal such as ".5"
      if (m = @scanner.scan(/\.[\d_]+(?:[eE][+-]?\d[\d_]*)?/))
        @tokens << Token.new(type: NUMBER, value: m, position: pos)
        return
      end

      # 9. Typecast :: (checked before single ":" in step 13)
      if @scanner.scan(/::/)
        @tokens << Token.new(type: TYPECAST, value: "::", position: pos)
        return
      end

      # 10. Single-char punctuation
      ch = @scanner.peek(1)
      case ch
      when "("
        @scanner.getch
        @tokens << Token.new(type: LPAREN, value: "(", position: pos)
        return
      when ")"
        @scanner.getch
        @tokens << Token.new(type: RPAREN, value: ")", position: pos)
        return
      when "["
        @scanner.getch
        @tokens << Token.new(type: LBRACKET, value: "[", position: pos)
        return
      when "]"
        @scanner.getch
        @tokens << Token.new(type: RBRACKET, value: "]", position: pos)
        return
      when ","
        @scanner.getch
        @tokens << Token.new(type: COMMA, value: ",", position: pos)
        return
      when ";"
        @scanner.getch
        @tokens << Token.new(type: SEMICOLON, value: ";", position: pos)
        return
      when "*"
        @scanner.getch
        @tokens << Token.new(type: STAR, value: "*", position: pos)
        return
      when "."
        @scanner.getch
        @tokens << Token.new(type: DOT, value: ".", position: pos)
        return
      end

      # 11. Multi-char operators — greedy run of PostgreSQL operator characters.
      #     Comments were consumed in steps 2-3, so "--" and "/*" never reach here.
      if (m = @scanner.scan(%r{[+\-/<>=~!@#%^&|`?]+}))
        @tokens << Token.new(type: OP, value: m, position: pos)
        return
      end

      # 12. Unquoted identifier / keyword (with string prefix detection)
      if (m = @scanner.scan(/[\p{L}_][\p{L}\d_]*/))
        lower = m.downcase

        # Check for U& prefix for unicode strings/identifiers
        if lower == "u" && @scanner.check(/&['"]/i)
          @scanner.scan(/&/)
          if @scanner.check(/'/)
            scan_single_quoted_string(pos, prefix: "U&")
          else
            scan_quoted_identifier(pos, unicode: true)
          end
          return
        end

        # Check for string prefixes: E, B, X, N immediately followed by '
        if %w[e b x n].include?(lower) && @scanner.check(/'/)
          scan_single_quoted_string(pos, prefix: lower)
          return
        end

        # Keywords are normalized to uppercase, plain identifiers to lowercase
        # (mirroring PostgreSQL's case-folding of unquoted identifiers).
        upper = m.upcase
        if Keywords::ALL.include?(upper)
          @tokens << Token.new(type: KEYWORD, value: upper, position: pos)
        else
          @tokens << Token.new(type: IDENT, value: lower, position: pos)
        end
        return
      end

      # 13. Single colon (not part of ::)
      if @scanner.scan(/:/)
        @tokens << Token.new(type: OP, value: ":", position: pos)
        return
      end

      raise LexError, "unexpected character #{@scanner.peek(1).inspect} at position #{pos}"
    end

    # Skip a (possibly nested) block comment whose opening "/*" has already been
    # consumed. Raises when input ends before the comment closes.
    def scan_block_comment(start_pos)
      depth = 1
      until @scanner.eos?
        if @scanner.scan(/\/\*/)
          depth += 1
        elsif @scanner.scan(/\*\//)
          depth -= 1
          return if depth == 0
        else
          @scanner.getch
        end
      end
      raise LexError, "unterminated block comment starting at position #{start_pos}"
    end

    # Scan a dollar-quoted string ($$...$$ or $tag$...$tag$). The closing
    # delimiter must repeat the opening tag exactly; content is emitted verbatim
    # as a STRING token (no escape processing — dollar quoting has none).
    def scan_dollar_string(start_pos)
      @scanner.scan(/\$([\p{L}_][\p{L}\d_]*)?\$/)
      tag = @scanner.matched
      content = +""
      until @scanner.eos?
        idx = @scanner.rest.index(tag)
        if idx
          content << @scanner.rest[0, idx]
          @scanner.pos += idx + tag.length
          @tokens << Token.new(type: STRING, value: content, position: start_pos)
          return
        else
          content << @scanner.rest
          @scanner.terminate
        end
      end
      raise LexError, "unterminated dollar-quoted string starting at position #{start_pos}"
    end

    # Scan a single-quoted string whose opening "'" is at the scan position.
    # prefix is nil, "U&", or a lowercased letter prefix ("e"/"b"/"x"/"n").
    # Only the "e" prefix changes scanning (backslash escapes); other prefixes
    # scan identically and are preserved only in the escaped content. Doubled
    # quotes ('') are kept verbatim in the token value, not collapsed.
    def scan_single_quoted_string(start_pos, prefix:)
      @scanner.scan(/'/)
      escape_mode = (prefix == "e") # E-strings support backslash escapes
      content = +""

      until @scanner.eos?
        if escape_mode && @scanner.scan(/\\/)
          if @scanner.eos?
            raise LexError, "unterminated string starting at position #{start_pos}"
          end
          # Keep the backslash and escaped character verbatim (covers \' and \\).
          content << "\\" << @scanner.getch
        elsif @scanner.scan(/''/)
          content << "''"
        elsif @scanner.scan(/'/)
          @tokens << Token.new(type: STRING, value: content, position: start_pos)
          return
        else
          content << @scanner.getch
        end
      end

      raise LexError, "unterminated string starting at position #{start_pos}"
    end

    # Scan a double-quoted identifier whose opening '"' is at the scan position.
    # Doubled quotes ("") collapse to a single '"' in the token value.
    # NOTE: the unicode: flag does not currently alter scanning — U&"..."
    # escape sequences are passed through verbatim in the token value.
    def scan_quoted_identifier(start_pos, unicode:)
      @scanner.scan(/"/)
      content = +""

      until @scanner.eos?
        if @scanner.scan(/""/)
          content << '"'
        elsif @scanner.scan(/"/)
          @tokens << Token.new(type: QUOTED_IDENT, value: content, position: start_pos)
          return
        else
          content << @scanner.getch
        end
      end

      raise LexError, "unterminated quoted identifier starting at position #{start_pos}"
    end
  end
end
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Pgsqlarbiter
  # Immutable token produced by the {Lexer}.
  #
  # @!attribute [r] type
  #   @return [Symbol] token type (one of the {TokenType} constants)
  # @!attribute [r] value
  #   @return [String, nil] the token text (+nil+ for EOF)
  # @!attribute [r] position
  #   @return [Integer] character offset in the original SQL string
  Token = Data.define(:type, :value, :position)

  # Constants for all token types produced by the {Lexer}. Each constant's
  # value is the lowercased symbol of its name (e.g. +KEYWORD == :keyword+).
  module TokenType
    %i[
      keyword ident quoted_ident string number param
      lparen rparen lbracket rbracket
      comma dot semicolon star typecast op eof
    ].each { |sym| const_set(sym.to_s.upcase, sym) }
  end
end
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Pgsqlarbiter
  # Immutable result of judging a SQL query against an {Arbiter}'s rules.
  #
  # A +Verdict+ tells you whether a query is allowed and, if not, exactly which
  # checks failed. Use {#allowed?} for a quick boolean, {#reasons} for
  # human-readable denial strings, or the individual fields for programmatic
  # branching.
  #
  # @!attribute [r] allowed
  #   @return [Boolean] +true+ when the query passes all checks
  # @!attribute [r] statement_type_allowed
  #   @return [Boolean] +true+ when the statement type is in the whitelist
  # @!attribute [r] statement_type
  #   @return [Symbol] the query's actual statement type
  # @!attribute [r] disallowed_tables
  #   @return [Array<String>] tables referenced by the query that are not whitelisted
  # @!attribute [r] disallowed_functions
  #   @return [Array<String>] functions called by the query that are not whitelisted
  Verdict = Data.define(:allowed, :statement_type_allowed, :statement_type,
                        :disallowed_tables, :disallowed_functions) do
    alias_method :allowed?, :allowed
    alias_method :statement_type_allowed?, :statement_type_allowed

    # Human-readable denial reasons, in a stable order: statement type first,
    # then tables, then functions. Empty when the query is allowed.
    #
    # @return [Array<String>] frozen list of reason strings
    def reasons
      list = disallowed_tables.map { |t| "table #{t.inspect} is not allowed" } +
             disallowed_functions.map { |f| "function #{f.inspect} is not allowed" }
      unless statement_type_allowed
        list.unshift("statement type :#{statement_type} is not allowed")
      end
      list.freeze
    end
  end
end
|