eden 0.1.1

Files changed (38)
  1. data/CHANGELOG +4 -0
  2. data/LICENSE +20 -0
  3. data/README.md +48 -0
  4. data/Rakefile +10 -0
  5. data/bin/eden +132 -0
  6. data/lib/eden.rb +10 -0
  7. data/lib/eden/defaults.rb +26 -0
  8. data/lib/eden/formatter.rb +25 -0
  9. data/lib/eden/formatters/block_formatter.rb +45 -0
  10. data/lib/eden/formatters/indenter.rb +91 -0
  11. data/lib/eden/formatters/white_space_cleaner.rb +14 -0
  12. data/lib/eden/line.rb +65 -0
  13. data/lib/eden/source_file.rb +32 -0
  14. data/lib/eden/token.rb +62 -0
  15. data/lib/eden/tokenizer.rb +259 -0
  16. data/lib/eden/tokenizers/basic_tokenizer.rb +167 -0
  17. data/lib/eden/tokenizers/delimited_literal_tokenizer.rb +38 -0
  18. data/lib/eden/tokenizers/number_tokenizer.rb +68 -0
  19. data/lib/eden/tokenizers/operator_tokenizer.rb +211 -0
  20. data/lib/eden/tokenizers/regex_tokenizer.rb +37 -0
  21. data/lib/eden/tokenizers/string_tokenizer.rb +149 -0
  22. data/test/array_literal_tokenization_test.rb +43 -0
  23. data/test/basic_tokenization_test.rb +29 -0
  24. data/test/block_formatter_test.rb +47 -0
  25. data/test/class_var_token_test.rb +21 -0
  26. data/test/identifier_token_test.rb +140 -0
  27. data/test/indenter_test.rb +314 -0
  28. data/test/instance_var_token_test.rb +48 -0
  29. data/test/number_tokenization_test.rb +83 -0
  30. data/test/operator_tokenization_test.rb +180 -0
  31. data/test/regex_tokenization_test.rb +68 -0
  32. data/test/single_character_tokenization_test.rb +87 -0
  33. data/test/string_tokenization_test.rb +291 -0
  34. data/test/symbol_tokenization_test.rb +64 -0
  35. data/test/test_helper.rb +13 -0
  36. data/test/white_space_cleaner_test.rb +35 -0
  37. data/test/whitespace_token_test.rb +63 -0
  38. metadata +108 -0
data/lib/eden/source_file.rb ADDED
@@ -0,0 +1,32 @@
+ module Eden
+   class SourceFile
+     attr_accessor :source, :lines
+
+     def initialize( file_name )
+       @file_name = file_name
+       @lines = []
+     end
+
+     def load!
+       file = File.open( @file_name, "r" )
+       @source = file.read
+     end
+
+     def tokenize!
+       tokenizer = Tokenizer.new( self )
+       tokenizer.tokenize!
+     end
+
+     def each_line
+       @lines.each { |l| yield l }
+     end
+
+     def rewrite!
+       File.open(@file_name, 'w') do |f|
+         each_line do |l|
+           f.write l.joined_tokens
+         end
+       end
+     end
+   end
+ end
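For orientation, a minimal usage sketch of SourceFile follows. This is an illustration, not part of the gem: it assumes `require 'eden'` loads SourceFile, Tokenizer and Line, and that example.rb exists on disk.

    require 'eden'

    sf = Eden::SourceFile.new( "example.rb" )
    sf.load!        # reads the file into sf.source
    sf.tokenize!    # the Tokenizer pushes Line objects onto sf.lines
    sf.each_line { |line| puts line.joined_tokens }
    sf.rewrite!     # writes the joined tokens back over the original file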
data/lib/eden/token.rb ADDED
@@ -0,0 +1,62 @@
+ module Eden
+   class Token
+     attr_accessor :type, :content
+
+     BINARY_OPERATORS = [:matches, :identity_equality, :equality,
+                         :not_equals, :not_matches, :plus_equals, :plus,
+                         :plus_at, :plus, :minus_equals, :minus_at, :minus,
+                         :exponent_equals, :exponent, :multiply_equals, :multiply,
+                         :divide, :divide_equals,
+                         :left_shift_equals, :left_shift, :lte, :lt,
+                         :right_shift_equals, :right_shift, :gte, :gt,
+                         :sort_operator,
+                         :logical_or_equals, :logical_or,
+                         :bitwise_or_equals, :bitwise_or,
+                         :logical_and_equals, :logical_and,
+                         :bitwise_and_equals, :bitwise_and]
+
+     UNARY_OPERATORS = [:plus, :minus, :multiply, :logical_not, :tilde]
+
+     KEYWORDS = [:__LINE__, :__ENCODING__, :__FILE__, :BEGIN,
+                 :END, :alias, :and, :begin, :break, :case,
+                 :class, :def, :defined?, :do, :else, :elsif,
+                 :end, :ensure, :false, :for, :if, :in,
+                 :module, :next, :nil, :not, :or, :redo,
+                 :rescue, :retry, :return, :self, :super,
+                 :then, :true, :undef, :unless, :until,
+                 :when, :while, :yield]
+
+     def initialize( type, content )
+       @type = type
+       @content = content
+     end
+
+     def inspect
+       if @content.nil? || @content == "\n"
+         @type.to_s
+       else
+         @type.to_s + "- \"" + @content + "\""
+       end
+     end
+
+     def operator?
+       binary_operator? || unary_operator?
+     end
+
+     def unary_operator?
+       UNARY_OPERATORS.include?( type )
+     end
+
+     def binary_operator?
+       BINARY_OPERATORS.include?( type )
+     end
+
+     def keyword?
+       KEYWORDS.include?( type )
+     end
+
+     def is?( token_type )
+       @type == token_type
+     end
+   end
+ end
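The Token predicates above are easiest to see with a couple of throwaway instances (illustrative only; the token types used here are taken from the constants defined in this file):

    plus = Eden::Token.new( :plus, "+" )
    plus.operator?       # => true, :plus appears in the operator lists
    plus.keyword?        # => false
    plus.is?( :plus )    # => true

    kw = Eden::Token.new( :class, "class" )
    kw.keyword?          # => true
    kw.inspect           # => 'class- "class"'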
data/lib/eden/tokenizer.rb ADDED
@@ -0,0 +1,259 @@
+ require 'eden/tokenizers/basic_tokenizer'
+ require 'eden/tokenizers/delimited_literal_tokenizer'
+ require 'eden/tokenizers/number_tokenizer'
+ require 'eden/tokenizers/operator_tokenizer'
+ require 'eden/tokenizers/regex_tokenizer'
+ require 'eden/tokenizers/string_tokenizer'
+
+
+ module Eden
+   class Tokenizer
+     include BasicTokenizer
+     include DelimitedLiteralTokenizer
+     include NumberTokenizer
+     include OperatorTokenizer
+     include RegexTokenizer
+     include StringTokenizer
+
+     def initialize( source_file )
+       @sf = source_file
+       @interpolating = [] # Stack for state when interpolating into strings
+       @delimiters = []    # Stack for delimiters which we need to keep when interpolating
+     end
+
+     def tokenize!
+       @i = 0          # Current position in the source buffer
+       @ln = 1         # Line Number
+       @cp = 0         # Current Character in the line
+       @thunk_st = 0
+       @thunk_end = -1 # Start/end of the current token
+       @current_line = Line.new( @ln )
+       @length = @sf.source.length
+       @expr_state = :beg # Same as lex_state variable in parse.c in Ruby source
+       default_state_transitions!
+
+       until( @i >= @length )
+         case( @state )
+         when :newline
+           advance
+           @expr_state = :beg
+           @current_line.tokens << capture_token( :newline )
+           @current_line.tokens.flatten!
+           @sf.lines << @current_line
+           @ln += 1
+           @current_line = Line.new( @ln )
+
+           if @heredoc_delimiter
+             @current_line.tokens << tokenize_heredoc_body
+           end
+         when :whitespace
+           @current_line.tokens << tokenize_whitespace
+         when :identifier # keyword / name / etc
+           @current_line.tokens << tokenize_identifier
+         when :instancevar
+           @current_line.tokens << tokenize_instancevar
+         when :classvar
+           @current_line.tokens << tokenize_classvar
+         when :globalvar
+           @current_line.tokens << tokenize_globalvar
+         when :delimited_literal
+           @current_line.tokens << tokenize_delimited_literal
+         when :lparen, :lsquare, :lcurly
+           @expr_state = :beg
+           @current_line.tokens << tokenize_single_character
+         when :comma
+           @expr_state = :beg
+           @current_line.tokens << tokenize_single_character
+         when :rsquare, :lcurly, :rparen
+           @expr_state = :end
+           @current_line.tokens << tokenize_single_character
+         when :rcurly
+           @current_line.tokens << tokenize_rcurly
+         when :tilde
+           default_expr_state_transition!
+           @current_line.tokens << tokenize_single_character
+         when :at, :semicolon, :backslash
+           @current_line.tokens << tokenize_single_character
+         when :question_mark
+           @current_line.tokens << tokenize_question_mark
+         when :colon
+           @current_line.tokens << tokenize_colon
+         when :period
+           @current_line.tokens << tokenize_period
+         when :plus
+           @current_line.tokens << tokenize_plus_operators
+         when :minus
+           @current_line.tokens << tokenize_minus_operators
+         when :equals
+           @current_line.tokens << tokenize_equals_operators
+         when :multiply
+           @current_line.tokens << tokenize_multiply_operators
+         when :divide
+           @current_line.tokens << tokenize_potential_regex
+         when :lt
+           @current_line.tokens << tokenize_lt_operators
+         when :gt
+           @current_line.tokens << tokenize_gt_operators
+         when :pipe
+           @current_line.tokens << tokenize_pipe_operators
+         when :ampersand
+           @current_line.tokens << tokenize_ampersand_operators
+         when :modulo
+           @current_line.tokens << tokenize_modulo_operators
+         when :caret
+           @current_line.tokens << tokenize_caret_operators
+         when :bang
+           @current_line.tokens << tokenize_bang_operators
+         when :comment
+           @current_line.tokens << tokenize_comment
+         when :single_q_string
+           @current_line.tokens << tokenize_single_quote_string
+         when :double_q_string
+           @current_line.tokens << tokenize_double_quote_string
+         when :backquote_string
+           @current_line.tokens << tokenize_backquote_string
+         when :symbol
+           @current_line.tokens << tokenize_symbol
+         when :dec_literal
+           @current_line.tokens << tokenize_decimal_literal
+         when :bin_literal, :oct_literal, :hex_literal
+           @current_line.tokens << tokenize_integer_literal
+         end
+       end
+       @sf.lines << @current_line.flatten!
+     end
+
+     private
+
+     def thunk
+       @sf.source[[@thunk_st, @length-1].min..[@thunk_end, @length-1].min]
+     end
+
+     def default_state_transitions!
+       case( cchar )
+       when nil then @state = :eof
+       when ' ' then @state = :whitespace
+       when "\t" then @state = :whitespace
+       when "\n" then @state = :newline
+       when '"' then @state = :double_q_string
+       when '\'' then @state = :single_q_string
+       when '`' then @state = :backquote_string
+       when '$' then @state = :globalvar
+       when '@'
+         if peek_ahead_for( /@/ )
+           @state = :classvar
+         elsif peek_ahead_for( /[A-Za-z_]/ )
+           @state = :instancevar
+         else
+           @state = :at
+         end
+       when '/' then @state = :divide
+       when '#' then @state = :comment
+       when ',' then @state = :comma
+       when '.' then @state = :period
+       when '&' then @state = :ampersand
+       when '!' then @state = :bang
+       when '~' then @state = :tilde
+       when '^' then @state = :caret
+       when '|' then @state = :pipe
+       when '>' then @state = :gt
+       when '<' then @state = :lt
+       when '?' then @state = :question_mark
+       when ';' then @state = :semicolon
+       when '=' then @state = :equals
+       when '\\' then @state = :backslash
+       when '%'
+         if @expr_state == :beg && !peek_ahead_for(/ /)
+           @state = :delimited_literal
+         else
+           @state = :modulo
+         end
+       when '*' then @state = :multiply
+       when '(' then @state = :lparen
+       when ')' then @state = :rparen
+       when '{'
+         @interpolating << nil
+         @state = :lcurly
+       when '}' then @state = :rcurly
+       when '[' then @state = :lsquare
+       when ']' then @state = :rsquare
+       when ':'
+         if peek_ahead_for(/[: ]/)
+           @state = :colon
+         else
+           @state = :symbol
+         end
+       when 'a'..'z', 'A'..'Z', '_'
+         @state = :identifier
+       when '0'
+         @expr_state = :end
+         if peek_ahead_for(/[xX]/)
+           @state = :hex_literal
+         elsif peek_ahead_for(/[bB]/)
+           @state = :bin_literal
+         elsif peek_ahead_for(/[_oO0-7]/)
+           @state = :oct_literal
+         elsif peek_ahead_for(/[89]/)
+           puts "Illegal Octal Digit"
+         elsif peek_ahead_for(/[dD]/)
+           @state = :dec_literal
+         else
+           @state = :dec_literal
+         end
+       when '1'..'9'
+         @state = :dec_literal
+       when '+', '-'
+         if peek_ahead_for( /[0-9]/ )
+           @state = :dec_literal
+         else
+           @state = ( cchar == '+' ? :plus : :minus )
+         end
+       end
+     end
+
+     # Manages the expression state to match the state machine in parse.c
+     def default_expr_state_transition!
+       if @expr_state == :fname || @expr_state == :dot
+         @expr_state = :arg
+       else
+         @expr_state = :beg
+       end
+     end
+
+     # Helper functions for expression state, from parse.c:9334
+     def is_arg
+       [:arg, :cmd_arg].include?( @expr_state )
+     end
+
+     def is_beg
+       [:beg, :mid, :class].include?( @expr_state )
+     end
+
+     # Returns the current character
+     def cchar
+       @sf.source[@i..@i]
+     end
+
+     # Advance the current position in the source file
+     def advance( num=1 )
+       @thunk_end += num; @i += num
+     end
+
+     # Resets the thunk to start at the current character
+     def reset_thunk!
+       @thunk_st = @i
+       @thunk_end = @i - 1
+     end
+
+     def peek_ahead_for( regex )
+       @sf.source[@i+1..@i+1] && !!regex.match( @sf.source[@i+1..@i+1] )
+     end
+
+     def capture_token( type )
+       token = Token.new( type, thunk )
+       reset_thunk!
+       default_state_transitions!
+       return token
+     end
+   end
+ end
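A rough sketch of driving the tokenizer by hand (an assumption-laden example, not gem documentation: it presumes `require 'eden'` loads Tokenizer, Token and Line, and that Line#tokens and Token#inspect behave as in the files above; the Struct below merely duck-types SourceFile's source/lines interface):

    require 'eden'

    StubSource = Struct.new( :source, :lines )   # stand-in for Eden::SourceFile
    sf = StubSource.new( "x = 1 + 2\n", [] )

    Eden::Tokenizer.new( sf ).tokenize!

    sf.lines.each do |line|
      puts line.tokens.map { |t| t.inspect }.join( " | " )
    end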
data/lib/eden/tokenizers/basic_tokenizer.rb ADDED
@@ -0,0 +1,167 @@
+ module Eden
+   module BasicTokenizer
+     def tokenize_single_character
+       @thunk_end += 1
+       token = Token.new(@state, thunk)
+       @i += 1
+       reset_thunk!
+       default_state_transitions!
+       return token
+     end
+
+     def tokenize_period
+       advance
+       if cchar == '.'
+         advance
+         @expr_state = :beg
+         return (advance and capture_token( :range_inc )) if cchar == '.'
+         capture_token( :range_exc )
+       else
+         @expr_state = :dot
+         capture_token( :period )
+       end
+     end
+
+     def tokenize_rcurly
+       @thunk_end += 1
+       old_state = @interpolating.pop
+       old_start_delimiter = @delimiters.pop
+       tokens = []
+       if old_state
+         tokens << Token.new(@state, thunk)
+         @i += 1
+         reset_thunk!
+         @state = old_state
+         tokens << tokenize_expanded_string( old_start_delimiter, true)
+       else
+         tokens << Token.new(@state, thunk)
+         @i += 1
+         reset_thunk!
+       end
+       default_state_transitions!
+       return tokens
+     end
+
+     # tokenizes operators beginning with a colon
+     def tokenize_colon
+       advance
+       if cchar == ':'
+         advance
+         if is_beg || (is_arg && @line.last_token_is_space?)
+           @expr_state = :beg
+         else
+           @expr_state = :dot
+         end
+         return capture_token( :scope_res )
+       else
+         @expr_state = :beg
+         return capture_token(:colon)
+       end
+     end
+
+     # tokenizes question mark / character literals
+     def tokenize_question_mark
+       advance
+       if @expr_state == :end || @expr_state == :endarg
+         @expr_state = :beg
+         return capture_token(:question_mark)
+       end
+
+       if (cchar != ' ' && cchar != "\t") && @i < @length
+         advance until cchar == ' ' || cchar == "\t" ||
+           cchar == "\r" || cchar == "\n" || @i > @length
+         return capture_token(:character_literal)
+       end
+
+       capture_token(:question_mark)
+     end
+
+     def tokenize_identifier
+       @expr_state = :end
+       advance until( /[A-Za-z0-9_]/.match( cchar ).nil? )
+       translate_keyword_tokens(capture_token( @state ))
+     end
+
+     def tokenize_whitespace
+       advance until( cchar != ' ' && cchar != "\t" )
+       capture_token( :whitespace )
+     end
+
+     def tokenize_comment
+       advance until( cchar == "\n" || cchar.nil?)
+       capture_token( :comment )
+     end
+
+     def tokenize_instancevar
+       @expr_state = :end
+       advance # Pass the @ symbol
+       advance until( /[a-z0-9_]/.match( cchar ).nil? )
+       capture_token( :instancevar )
+     end
+
+     def tokenize_classvar
+       @expr_state = :end
+       advance(2) # Pass the @@ symbol
+       advance until( /[a-z0-9_]/.match( cchar ).nil? )
+       capture_token( :classvar )
+     end
+
+     def tokenize_symbol
+       @expr_state = :end
+       advance # Pass the :
+       case cchar
+       when '"' then return tokenize_double_quote_string
+       when '\'' then return tokenize_single_quote_string
+       end
+       if /^(\^|&|\||<=>|==|===|!~|=~|>>|>=|<<|<=|>|<|\+|\-|\*\*|\/|%|\*|~|\+@|-@|\[\]|\[\]=)/x.match(@sf.source[@i..-1])
+         advance($1.length)
+         return capture_token(:symbol)
+       end
+       advance while( /[A-Za-z0-9_!=\?]/.match(cchar) )
+       capture_token( :symbol )
+     end
+
+     def tokenize_globalvar
+       @expr_state = :end
+       advance # Pass the $
+       if /[!@_\.&~0-9=\/\\\*$\?:'`]/.match( cchar )
+         advance and capture_token( :globalvar )
+       elsif /[A-Za-z0-9_]/.match( cchar )
+         advance while /[A-Za-z0-9_]/.match( cchar )
+         capture_token( :globalvar )
+       else
+         raise "Invalid Global Variable Name"
+       end
+     end
+
+     # Takes an identifier token, and transforms its type to
+     # match Ruby keywords where the identifier is actually a keyword.
+     # Reserved words are defined in S.8.5.1 of the Ruby spec.
+     def translate_keyword_tokens( token )
+       keywords = ["__LINE__", "__ENCODING__", "__FILE__", "BEGIN",
+                   "END", "alias", "and", "begin", "break", "case",
+                   "class", "def", "defined?", "do", "else", "elsif",
+                   "end", "ensure", "false", "for", "if", "in",
+                   "module", "next", "nil", "not", "or", "redo",
+                   "rescue", "retry", "return", "self", "super",
+                   "then", "true", "undef", "unless", "until",
+                   "when", "while", "yield"]
+       if keywords.include?( token.content )
+         token.type = token.content.downcase.to_sym
+         # Change the state if we match a keyword
+         @expr_state = :beg
+       end
+
+       # A couple of exceptions
+       if token.content == "BEGIN"
+         token.type = :begin_global
+         @expr_state = :beg
+       elsif token.content == "END"
+         token.type = :end_global
+         @expr_state = :beg
+       end
+
+       token
+     end
+   end
+ end
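To make the keyword translation concrete, a tiny probe can exercise translate_keyword_tokens on its own (hypothetical harness, not part of the gem; in eden this module is only ever mixed into Eden::Tokenizer, and the example assumes `require 'eden'` loads both the module and Token):

    require 'eden'

    class KeywordProbe
      include Eden::BasicTokenizer   # pulls in translate_keyword_tokens
      attr_reader :expr_state
    end

    probe = KeywordProbe.new
    tok = Eden::Token.new( :identifier, "while" )
    probe.translate_keyword_tokens( tok )
    tok.type           # => :while, and probe.expr_state is now :beg

    plain = Eden::Token.new( :identifier, "counter" )
    probe.translate_keyword_tokens( plain )
    plain.type         # => :identifier, not a reserved word so left unchanged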