sas-lexer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,489 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ffi"
4
+
5
+ module SasLexer
6
+ # Ruby wrapper around the `sas-lexer` Rust crate, accessed through
7
+ # the small C ABI shim built from `ffi-wrapper/`.
8
+ #
9
+ # Loads the prebuilt shared library shipped under `lib/native/` (or
10
+ # the dev-build flat path produced by `rake sas_lexer:install`).
11
+ class Lexer
12
+ extend FFI::Library
13
+
14
# Locate the native shared library for this host. These statements run once,
# while the class body is evaluated, and raise SasLexer::Error when the host
# OS is not recognised or no library file exists at any probed location.
native_root = File.join(File.expand_path("../..", __dir__), "lib", "native")
raw_host_os = RbConfig::CONFIG["host_os"]

# Translate RbConfig's host_os string into the OS label used in the
# per-platform directory name, paired with that OS's shared-library extension.
host_os, library_ext =
  case raw_host_os
  when /darwin/ then %w[darwin dylib]
  when /linux/ then %w[linux so]
  when /mswin|mingw|cygwin/ then %w[windows dll]
  else
    raise SasLexer::Error,
          "Unsupported host OS: #{raw_host_os}"
  end

host_platform = "#{RbConfig::CONFIG["host_cpu"]}-#{host_os}"
library_file = "libsas_lexer_ffi.#{library_ext}"

# Candidates, in probe order:
#   1. `lib/native/<host_cpu>-<host_os>/libsas_lexer_ffi.<ext>` — the
#      prebuilt artifact shipped inside the published universal gem for
#      the host's exact platform.
#   2. `lib/native/libsas_lexer_ffi.<ext>` — the flat path produced by
#      `bundle exec rake sas_lexer:install` for local development.
# The first candidate that exists on disk wins; nil means none was found.
LIBRARY_PATH = [
  File.join(native_root, host_platform, library_file),
  File.join(native_root, library_file),
].find { |candidate| File.exist?(candidate) }

unless LIBRARY_PATH
  raise SasLexer::Error,
        "Could not find a prebuilt sas-lexer FFI library at " \
        "lib/native/#{host_platform}/#{library_file}. " \
        "Build one with `bundle exec rake sas_lexer:install` " \
        "or add a prebuilt for #{host_platform} under lib/native/."
end

ffi_lib LIBRARY_PATH
50
+
51
# FFI view of the `SasToken` C struct declared in `ffi-wrapper/src/lib.rs`.
#
# The member order and types given to `layout` are what FFI uses to read the
# struct, so they are listed exactly as in the original binding.
# NOTE(review): `start`/`end` look like offsets into the source text and the
# line/column members like positions of the token's first and last character;
# confirm units and base (0 or 1) against the Rust definition.
class Token < FFI::Struct
  layout(
    :token_type,   :uint32,
    :channel,      :uint8,
    :start,        :size_t,
    :end,          :size_t,
    :start_line,   :uint32,
    :end_line,     :uint32,
    :start_column, :uint32,
    :end_column,   :uint32
  )
end
62
+
63
# Native entry points bound onto this class, written as
# function name => [argument types, return type]. They are attached in the
# order listed.
{
  sas_lexer_new:            [[], :pointer],
  sas_lexer_free:           [[:pointer], :void],
  sas_lexer_tokenize:       [[:pointer, :string], :int],
  sas_lexer_token_count:    [[:pointer], :size_t],
  sas_lexer_get_token:      [[:pointer, :size_t, :pointer], :int],
  sas_lexer_get_token_text: [[:pointer, :size_t], :pointer],
  sas_lexer_free_string:    [[:pointer], :void],
  sas_lexer_get_last_error: [[], :pointer],
  sas_lexer_clear_error:    [[], :void],
}.each do |function_name, (arg_types, return_type)|
  attach_function function_name, arg_types, return_type
end
72
+
73
# Integer status codes returned by the C ABI functions. Code in this class
# treats any value other than SUCCESS as a failure.
# NOTE(review): the non-zero names are taken as-is from the binding; confirm
# their exact meaning against `ffi-wrapper/src/lib.rs`.
module ErrorCode
  SUCCESS                = 0
  NULL_POINTER           = 1
  INVALID_UTF8           = 2
  LEXING_ERROR           = 3
  INDEX_OUT_OF_BOUNDS    = 4
  TOKEN_NOT_FOUND        = 5
  BUFFER_NOT_INITIALIZED = 6
end
83
+
84
# Channel a token is emitted on. Values match the `TokenChannel` enum in the
# Rust crate and appear in the `:channel` member of each token.
module TokenChannel
  # Most tokens (keywords, identifiers, operators, etc.)
  DEFAULT = 0
  # Whitespace and other insignificant tokens
  HIDDEN  = 1
  # All comment tokens
  COMMENT = 2
end
90
+
91
# Numeric token type identifiers, matching the `TokenType` enum in the Rust
# crate. These are the values found in the `:token_type` struct member.
# Full enum: https://github.com/mishamsk/sas-lexer/blob/main/crates/sas-lexer/src/lexer/token_type.rs
#
# Values are contiguous from 0 to 287; WHITESPACE and COMMENT_STAT are
# aliases that share a value with another constant.
module TokenType
  # --- Special tokens ---
  EOF        = 0
  MACRO_SEP  = 1
  CATCH_ALL  = 2
  WS         = 3
  WHITESPACE = 3 # Alias for WS

  # --- Punctuation and operators ---
  SEMI        = 4  # ';'
  AMP         = 5  # '&'
  PERCENT     = 6  # '%'
  LPAREN      = 7  # '('
  RPAREN      = 8  # ')'
  LCURLY      = 9  # '{'
  RCURLY      = 10 # '}'
  LBRACK      = 11 # '['
  RBRACK      = 12 # ']'
  STAR        = 13 # '*'
  EXCL        = 14 # '!'
  EXCL2       = 15 # '!!'
  BPIPE       = 16 # '!|'
  BPIPE2      = 17 # '!|!'
  PIPE2       = 18 # '||'
  STAR2       = 19 # '**'
  NOT         = 20 # '¬' or '^'
  FSLASH      = 21 # '/'
  PLUS        = 22 # '+'
  MINUS       = 23 # '-'
  GTLT        = 24 # '><'
  LTGT        = 25 # '<>'
  LT          = 26 # '<'
  LE          = 27 # '<='
  NE          = 28 # '!=' or '^='
  GT          = 29 # '>'
  GE          = 30 # '>='
  SOUNDS_LIKE = 31 # '=*'
  PIPE        = 32 # '|'
  DOT         = 33 # '.'
  COMMA       = 34 # ','
  COLON       = 35 # ':'
  ASSIGN      = 36 # '='
  DOLLAR      = 37 # '$'
  AT          = 38 # '@'
  HASH        = 39 # '#'
  QUESTION    = 40 # '?'

  # --- Logical operators (mnemonic keywords) ---
  KW_LT  = 41 # LT
  KW_LE  = 42 # LE
  KW_EQ  = 43 # EQ
  KW_IN  = 44 # IN
  KW_NE  = 45 # NE
  KW_GT  = 46 # GT
  KW_GE  = 47 # GE
  KW_AND = 48 # AND
  KW_OR  = 49 # OR
  KW_NOT = 50 # NOT

  # --- Literals and string-expression parts ---
  INTEGER_LITERAL              = 51
  FLOAT_LITERAL                = 52
  FLOAT_EXPONENT_LITERAL       = 53
  STRING_LITERAL               = 54
  BIT_TESTING_LITERAL          = 55
  DATE_LITERAL                 = 56
  DATE_TIME_LITERAL            = 57
  NAME_LITERAL                 = 58
  TIME_LITERAL                 = 59
  HEX_STRING_LITERAL           = 60
  STRING_EXPR_START            = 61
  STRING_EXPR_TEXT             = 62
  STRING_EXPR_END              = 63
  BIT_TESTING_LITERAL_EXPR_END = 64
  DATE_LITERAL_EXPR_END        = 65
  DATE_TIME_LITERAL_EXPR_END   = 66
  NAME_LITERAL_EXPR_END        = 67
  TIME_LITERAL_EXPR_END        = 68
  HEX_STRING_LITERAL_EXPR_END  = 69

  # --- Comments ---
  C_STYLE_COMMENT        = 70 # /* ... */
  PREDICTED_COMMENT_STAT = 71 # * ...; and ** ... **;
  COMMENT_STAT           = 71 # Alias for PREDICTED_COMMENT_STAT

  # --- Datalines and formats ---
  DATALINES_START = 72
  DATALINES_DATA  = 73
  CHAR_FORMAT     = 74

  # --- Macro comment, variables, strings, labels and identifiers ---
  MACRO_COMMENT      = 75 # %* ...;
  MACRO_VAR_RESOLVE  = 76
  MACRO_VAR_TERM     = 77
  MACRO_STRING       = 78
  MACRO_STRING_EMPTY = 79
  MACRO_LABEL        = 80
  MACRO_IDENTIFIER   = 81

  # --- Macro functions ---
  KWM_CMPRES        = 82
  KWM_COMPSTOR      = 83
  KWM_DATATYP       = 84
  KWM_EVAL          = 85
  KWM_INDEX         = 86
  KWM_LEFT          = 87
  KWM_LENGTH        = 88
  KWM_LOWCASE       = 89
  KWM_SCAN          = 90
  KWM_SUBSTR        = 91
  KWM_SYM_EXIST     = 92
  KWM_SYM_GLOBL     = 93
  KWM_SYM_LOCAL     = 94
  KWM_SYSEVALF      = 95
  KWM_SYSFUNC       = 96
  KWM_SYSGET        = 97
  KWM_SYSMACEXEC    = 98
  KWM_SYSMACEXIST   = 99
  KWM_SYSMEXECDEPTH = 100
  KWM_SYSMEXECNAME  = 101
  KWM_SYSPROD       = 102
  KWM_TRIM          = 103
  KWM_UNQUOTE       = 104
  KWM_UPCASE        = 105
  KWM_VERIFY        = 106
  KWM_K_CMPRES      = 107
  KWM_K_INDEX       = 108
  KWM_K_LEFT        = 109
  KWM_K_LENGTH      = 110
  KWM_K_LOWCASE     = 111
  KWM_K_SCAN        = 112
  KWM_K_SUBSTR      = 113
  KWM_K_TRIM        = 114
  KWM_K_UPCASE      = 115
  KWM_K_VERIFY      = 116
  KWM_VALIDCHS      = 117
  KWM_Q_CMPRES      = 118
  KWM_Q_LEFT        = 119
  KWM_Q_LOWCASE     = 120
  KWM_Q_SCAN        = 121
  KWM_Q_SUBSTR      = 122
  KWM_Q_TRIM        = 123
  KWM_Q_SYSFUNC     = 124
  KWM_Q_UPCASE      = 125
  KWM_QK_CMPRES     = 126
  KWM_QK_LEFT       = 127
  KWM_QK_LOWCASE    = 128
  KWM_QK_SCAN       = 129
  KWM_QK_SUBSTR     = 130
  KWM_QK_TRIM       = 131
  KWM_QK_UPCASE     = 132
  KWM_BQUOTE        = 133
  KWM_NR_BQUOTE     = 134
  KWM_NR_QUOTE      = 135
  KWM_QUOTE         = 136
  KWM_SUPERQ        = 137
  KWM_STR           = 138
  KWM_NR_STR        = 139

  # --- Macro statements ---
  KWM_ABORT          = 140
  KWM_COPY           = 141
  KWM_DISPLAY        = 142
  KWM_DO             = 143
  KWM_TO             = 144
  KWM_BY             = 145
  KWM_UNTIL          = 146
  KWM_WHILE          = 147
  KWM_END            = 148
  KWM_GLOBAL         = 149
  KWM_GOTO           = 150
  KWM_IF             = 151
  KWM_THEN           = 152
  KWM_ELSE           = 153
  KWM_INPUT          = 154
  KWM_LET            = 155
  KWM_LOCAL          = 156
  KWM_MACRO          = 157
  KWM_MEND           = 158
  KWM_PUT            = 159
  KWM_RETURN         = 160
  KWM_SYMDEL         = 161
  KWM_SYSCALL        = 162
  KWM_SYSEXEC        = 163
  KWM_SYSLPUT        = 164
  KWM_SYSMACDELETE   = 165
  KWM_SYSMSTORECLEAR = 166
  KWM_SYSRPUT        = 167
  KWM_WINDOW         = 168
  KWM_INCLUDE        = 169
  KWM_LIST           = 170
  KWM_RUN            = 171

  # --- Identifiers (variable names, dataset names, etc.) ---
  IDENTIFIER = 172

  # --- Additional keyword operators ---
  KW_EQT = 173
  KW_GTT = 174
  KW_LTT = 175
  KW_GET = 176
  KW_LET = 177
  KW_NET = 178

  # --- SAS statement keywords ---
  KW_LIBNAME      = 179
  KW_FILENAME     = 180
  KW_CLEAR        = 181
  KW_LIST         = 182
  KW_CANCEL       = 183
  KW_ALL_VAR      = 184
  KW_ARRAY        = 185
  KW_ATTRIB       = 186
  KW_CALL         = 187
  KW_DATA         = 188
  KW_DEFAULT      = 189
  KW_DESCENDING   = 190
  KW_FORMAT       = 191
  KW_GROUPFORMAT  = 192
  KW_ID           = 193
  KW_IF           = 194
  KW_INFILE       = 195
  KW_INFORMAT     = 196
  KW_KEEP         = 197
  KW_LABEL        = 198
  KW_LENGTH       = 199
  KW_MERGE        = 200
  KW_NULL_DATASET = 201
  KW_OUTPUT       = 202
  KW_PGM          = 203
  KW_RENAME       = 204
  KW_RUN          = 205
  KW_SET          = 206
  KW_STOP         = 207
  KW_VAR          = 208
  KW_VIEW         = 209
  KW_WITH         = 210
  KW_DELETE       = 211
  KW_NOTSORTED    = 212
  KW_PROC         = 213
  KW_QUIT         = 214
  KW_RANKS        = 215

  # --- SQL and other SAS keywords ---
  KW_ALL        = 216
  KW_ANY        = 217
  KW_AS         = 218
  KW_ASC        = 219
  KW_BETWEEN    = 220
  KW_BOTH       = 221
  KW_BTRIM      = 222
  KW_BY         = 223
  KW_CALCULATED = 224
  KW_CASE       = 225
  KW_CONNECT    = 226
  KW_CONNECTION = 227
  KW_CONTAINS   = 228
  KW_CORR       = 229
  KW_CREATE     = 230
  KW_CROSS      = 231
  KW_DESC       = 232
  KW_DISCONNECT = 233
  KW_DISTINCT   = 234
  KW_DO         = 235
  KW_DROP       = 236
  KW_ELSE       = 237
  KW_END        = 238
  KW_ESCAPE     = 239
  KW_EXCEPT     = 240
  KW_EXECUTE    = 241
  KW_EXISTS     = 242
  KW_FOR        = 243
  KW_FROM       = 244
  KW_FULL       = 245
  KW_GROUP      = 246
  KW_HAVING     = 247
  KW_INDEX      = 248
  KW_INNER      = 249
  KW_INSERT     = 250
  KW_INTERSECT  = 251
  KW_INTO       = 252
  KW_IS         = 253
  KW_JOIN       = 254
  KW_KEY        = 255
  KW_LEADING    = 256
  KW_LEFT       = 257
  KW_LIKE       = 258
  KW_MISSING    = 259
  KW_NATURAL    = 260
  KW_NOTRIM     = 261
  KW_NULL       = 262
  KW_ON         = 263
  KW_ORDER      = 264
  KW_OUTER      = 265
  KW_PRIMARY    = 266
  KW_RIGHT      = 267
  KW_SELECT     = 268
  KW_SEPARATED  = 269
  KW_SUBSTRING  = 270
  KW_TABLE      = 271
  KW_THEN       = 272
  KW_TO         = 273
  KW_TRAILING   = 274
  KW_TRIMMED    = 275
  KW_UNION      = 276
  KW_UNIQUE     = 277
  KW_UPDATE     = 278
  KW_USING      = 279
  KW_VALUES     = 280
  KW_WHEN       = 281
  KW_WHERE      = 282
  KW_DECLARE    = 283
  KW_HASH       = 284
  KW_HITER      = 285
  KW_INPUT      = 286
  KW_PUT        = 287
end
406
+
407
# Creates a native lexer handle and ties its lifetime to this object.
#
# The handle is wrapped in an FFI::AutoPointer so that `sas_lexer_free` is
# invoked when the wrapper is garbage collected, even if `free` is never
# called. (Ruby does not call a method named `finalize` on its own, so
# without this an unfreed Lexer would leak its native handle.)
#
# @raise [SasLexer::Error] if the native constructor returns a null pointer
def initialize
  raw_ptr = self.class.sas_lexer_new
  raise SasLexer::Error, "Failed to create SAS lexer" if raw_ptr.null?

  # The releaser is a Method bound to the class, not to this instance, so it
  # does not keep the Lexer itself reachable.
  @lexer_ptr = FFI::AutoPointer.new(raw_ptr, self.class.method(:sas_lexer_free))
end

# Tokenizes SAS source and returns one Hash per token, in lexer order.
#
# Each Hash has `:index` and `:text`, plus `:type`, `:channel`, `:start`,
# `:end`, `:start_line`, `:end_line`, `:start_column` and `:end_column` copied
# from the native token struct. When the struct cannot be fetched for a token,
# the Hash carries only `:index`, `:text`, and nil `:type` / `:channel`.
# Tokens whose text pointer is null are omitted from the result.
#
# @param sas_code [String] SAS source text
# @return [Array<Hash>] token descriptions
# @raise [SasLexer::Error] if the lexer was freed, `sas_code` is nil, or the
#   native tokenizer reports a non-success status
def tokenize(sas_code)
  raise SasLexer::Error, "Lexer has been freed" if @lexer_ptr.null?
  raise SasLexer::Error, "Null pointer provided" if sas_code.nil?

  result = self.class.sas_lexer_tokenize(@lexer_ptr, sas_code)

  if result != ErrorCode::SUCCESS
    error_msg = get_last_error_message
    # A blank native message would yield an uninformative exception; fall
    # back to the generic text that at least carries the status code.
    error_msg = nil if error_msg&.empty?
    raise SasLexer::Error, error_msg || "Failed to tokenize SAS code (error code: #{result})"
  end

  token_count = self.class.sas_lexer_token_count(@lexer_ptr)

  (0...token_count).filter_map do |index|
    text_ptr = self.class.sas_lexer_get_token_text(@lexer_ptr, index)
    next if text_ptr.null?

    begin
      # The bytes come straight from native memory; tag them UTF-8 so they
      # compare equal to ordinary Ruby string literals. The C ABI exposes an
      # INVALID_UTF8 status, so input is presumably required to be UTF-8 —
      # confirm in ffi-wrapper/src/lib.rs.
      text = text_ptr.read_string.force_encoding(Encoding::UTF_8)

      token_struct = Token.new
      token_result = self.class.sas_lexer_get_token(@lexer_ptr, index, token_struct)

      if token_result == ErrorCode::SUCCESS
        {
          index: index,
          text: text,
          type: token_struct[:token_type],
          channel: token_struct[:channel],
          start: token_struct[:start],
          end: token_struct[:end],
          start_line: token_struct[:start_line],
          end_line: token_struct[:end_line],
          start_column: token_struct[:start_column],
          end_column: token_struct[:end_column]
        }
      else
        { index: index, text: text, type: nil, channel: nil }
      end
    ensure
      # Always hand the text buffer back to the native side, including when
      # reading it or fetching the struct raises.
      self.class.sas_lexer_free_string(text_ptr)
    end
  end
end

# Releases the native lexer handle immediately. Safe to call repeatedly;
# after the first call the lexer can no longer tokenize.
#
# @return [void]
def free
  return if @lexer_ptr.null?

  # AutoPointer#free runs the releaser now and cancels the GC-time release,
  # so the handle is freed exactly once.
  @lexer_ptr.free
  @lexer_ptr = FFI::Pointer::NULL
end

# Alias-style wrapper around `free`, kept for callers that invoke it
# explicitly. Ruby never calls this method automatically.
#
# @return [void]
def finalize
  free
end
475
+
476
+ private
477
+
478
# Reads the most recent error message reported by the native library.
#
# When the native call yields a non-null pointer, its string is read and the
# pointer is then passed to `sas_lexer_free_string`, whether or not the read
# succeeded.
#
# @return [String, nil] the message, or nil when the native pointer is null
def get_last_error_message
  message_ptr = self.class.sas_lexer_get_last_error

  if message_ptr.null?
    nil
  else
    begin
      message_ptr.read_string
    ensure
      self.class.sas_lexer_free_string(message_ptr)
    end
  end
end
488
+ end
489
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
# Top-level namespace of the sas-lexer gem.
module SasLexer
  # Version string of this gem release.
  VERSION = "0.1.0"
end
data/lib/sas_lexer.rb ADDED
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "sas_lexer/version"
4
+ require_relative "sas_lexer/error"
5
+ require_relative "sas_lexer/lexer"
metadata ADDED
@@ -0,0 +1,73 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sas-lexer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Craig McNamara
8
+ bindir: bin
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: ffi
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: '1.15'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - "~>"
24
+ - !ruby/object:Gem::Version
25
+ version: '1.15'
26
+ description: |
27
+ A Ruby FFI binding to the `sas-lexer` Rust crate by Misha Perlov
28
+ (https://github.com/mishamsk/sas-lexer). Tokenizes SAS source code
29
+ into a stream of typed tokens with full position metadata. Ships
30
+ prebuilt native libraries for supported platforms; a runtime FFI
31
+ loader picks the matching one for the host.
32
+ email:
33
+ - craig@monami.io
34
+ executables: []
35
+ extensions: []
36
+ extra_rdoc_files: []
37
+ files:
38
+ - LICENSE
39
+ - README.md
40
+ - Rakefile
41
+ - ffi-wrapper/Cargo.lock
42
+ - ffi-wrapper/Cargo.toml
43
+ - ffi-wrapper/src/lib.rs
44
+ - lib/native/arm64-darwin/libsas_lexer_ffi.dylib
45
+ - lib/native/x86_64-linux/libsas_lexer_ffi.so
46
+ - lib/sas_lexer.rb
47
+ - lib/sas_lexer/error.rb
48
+ - lib/sas_lexer/lexer.rb
49
+ - lib/sas_lexer/version.rb
50
+ homepage: https://github.com/mes-amis/sas-lexer-rb
51
+ licenses:
52
+ - AGPL-3.0-or-later
53
+ metadata:
54
+ homepage_uri: https://github.com/mes-amis/sas-lexer-rb
55
+ rubygems_mfa_required: 'true'
56
+ rdoc_options: []
57
+ require_paths:
58
+ - lib
59
+ required_ruby_version: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: 3.4.0
64
+ required_rubygems_version: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ requirements: []
70
+ rubygems_version: 4.0.6
71
+ specification_version: 4
72
+ summary: Ruby FFI wrapper around the sas-lexer Rust crate.
73
+ test_files: []