parslet 1.1.1 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/HISTORY.txt +29 -0
- data/README +2 -4
- data/Rakefile +18 -4
- data/example/comments.rb +11 -13
- data/example/documentation.rb +1 -1
- data/example/email_parser.rb +5 -5
- data/example/empty.rb +2 -2
- data/example/erb.rb +6 -3
- data/example/ip_address.rb +2 -2
- data/example/local.rb +34 -0
- data/example/minilisp.rb +2 -2
- data/example/output/comments.out +8 -0
- data/example/output/documentation.err +4 -0
- data/example/output/documentation.out +1 -0
- data/example/output/email_parser.out +2 -0
- data/example/output/empty.err +1 -0
- data/example/output/erb.out +7 -0
- data/example/output/ip_address.out +9 -0
- data/example/output/local.out +3 -0
- data/example/output/minilisp.out +5 -0
- data/example/output/parens.out +8 -0
- data/example/output/readme.out +1 -0
- data/example/output/seasons.out +28 -0
- data/example/output/simple_xml.out +2 -0
- data/example/output/string_parser.out +3 -0
- data/example/parens.rb +1 -3
- data/example/readme.rb +4 -10
- data/example/seasons.rb +2 -1
- data/example/simple_xml.rb +5 -8
- data/example/string_parser.rb +7 -5
- data/lib/parslet.rb +20 -31
- data/lib/parslet/atoms.rb +1 -0
- data/lib/parslet/atoms/base.rb +46 -87
- data/lib/parslet/atoms/dsl.rb +98 -0
- data/lib/parslet/atoms/entity.rb +3 -4
- data/lib/parslet/atoms/lookahead.rb +1 -1
- data/lib/parslet/atoms/re.rb +2 -2
- data/lib/parslet/atoms/str.rb +5 -2
- data/lib/parslet/atoms/transform.rb +75 -0
- data/lib/parslet/atoms/visitor.rb +9 -9
- data/lib/parslet/convenience.rb +3 -3
- data/lib/parslet/export.rb +13 -13
- data/lib/parslet/expression/treetop.rb +2 -2
- data/lib/parslet/parser.rb +55 -1
- data/lib/parslet/rig/rspec.rb +36 -10
- data/lib/parslet/slice.rb +172 -0
- data/lib/parslet/source.rb +72 -83
- data/lib/parslet/source/line_cache.rb +90 -0
- metadata +22 -20
@@ -0,0 +1,172 @@
|
|
1
|
+
|
2
|
+
# A slice is a small part from the parse input. A slice mainly behaves like
|
3
|
+
# any other string, except that it remembers where it came from (offset in
|
4
|
+
# original input).
|
5
|
+
#
|
6
|
+
# Some slices also know what parent slice they are a small part of. This
|
7
|
+
# allows the slice to be concatenated to other slices from the same buffer by
|
8
|
+
# reslicing it against that original buffer.
|
9
|
+
#
|
10
|
+
# Why the complexity? Slices allow retaining offset information. This will
|
11
|
+
# allow us to assign line and column to each small bit of output from the parslet
|
12
|
+
# parser. Also, while we keep that information, we might as well try to do
|
13
|
+
# something useful with it. Reslicing the same buffers should in theory keep
|
14
|
+
# buffer copies and allocations down.
|
15
|
+
#
|
16
|
+
# == Extracting line and column
|
17
|
+
#
|
18
|
+
# Using the #line_and_column method, you can extract the line and column in
|
19
|
+
# the original input where this slice starts.
|
20
|
+
#
|
21
|
+
# Example:
|
22
|
+
# slice.line_and_column # => [1, 13]
|
23
|
+
# slice.offset # => 12
|
24
|
+
#
|
25
|
+
# == Likeness to strings
|
26
|
+
#
|
27
|
+
# Parslet::Slice behaves in many ways like a Ruby String. This likeness
|
28
|
+
# however is not complete - many of the myriad of operations String supports
|
29
|
+
# are not yet in Slice. You can always extract the internal string instance by
|
30
|
+
# calling #to_s.
|
31
|
+
#
|
32
|
+
# These omissions are somewhat intentional. Rather than maintaining a full
|
33
|
+
# delegation, we opt for a partial emulation that gets the job done.
|
34
|
+
#
|
35
|
+
# Note also that there are some things that work with strings that will never
|
36
|
+
# work when using slices. For instance, you cannot concatenate slices that
|
37
|
+
# aren't from the same source or that don't join up:
|
38
|
+
#
|
39
|
+
# Example:
|
40
|
+
# big_slice = Parslet::Slice.new('abcdef', 0)
|
41
|
+
# a = big_slice.slice(0, 2) # => "ab"@0
|
42
|
+
# b = big_slice.slice(4, 2) # => "ef"@4
|
43
|
+
#
|
44
|
+
# a + b # raises Parslet::InvalidSliceOperation
|
45
|
+
#
|
46
|
+
# This avoids creating slices with impossible offsets or that are
|
47
|
+
# discontinuous.
|
48
|
+
#
|
49
|
+
class Parslet::Slice
  # str:    the String content of this slice.
  # offset: position of the first character of str in the original input.
  attr_reader :str, :offset
  # The slice this one was cut out of, or nil for a root slice.
  attr_reader :parent
  # The object answering #line_and_column(offset) for this slice, or nil.
  attr_reader :source

  # Creates a slice holding +string+, which starts at +offset+ in the
  # original input. +source+ is used for line/column lookups and +parent+
  # is the bigger slice this one was cut from; both are optional.
  #
  def initialize(string, offset, source=nil, parent=nil)
    @str, @offset = string, offset
    @source = source
    @parent = parent
  end

  # Compares slices to other slices or strings. Only the string content is
  # compared; offset, source and parent are ignored.
  #
  def == other
    str == other
  end

  # Match regular expressions against the string content.
  #
  def match(regexp)
    str.match(regexp)
  end

  # Returns a slice that starts at offset start (relative to the beginning
  # of this slice) and that has length characters. Whenever possible,
  # return parts of the parent buffer that this slice was cut out of.
  #
  def slice(start, length)
    # NOTE: At a later stage, we might not want to create huge trees of slices.
    # The fact that the root of the tree creates slices that link to it makes
    # the tree already rather flat.

    if parent
      # Translate the index relative to this slice into an index relative to
      # the parent. The requested start must be part of that translation,
      # otherwise every sub-slice would begin at this slice's first char.
      parent.slice(start + offset - parent.offset, length)
    else
      self.class.new(str.slice(start, length), offset+start, source, self)
    end
  end

  # Returns a slice that starts at file offset start and that has length
  # characters in it.
  #
  def abs_slice(start, length)
    slice(start-offset, length)
  end

  # True if this slice can satisfy an original input request to the
  # range ofs, len, i.e. the range lies completely inside this slice.
  #
  def satisfies?(ofs, len)
    ofs >= offset && (ofs-offset+len-1)<str.size
  end

  # Number of characters in this slice.
  #
  def size
    str.size
  end

  # Concatenates two adjacent slices from the same source.
  #
  # Raises ArgumentError if other does not respond to #to_slice and
  # Parslet::InvalidSliceOperation if the slices are not adjacent or do not
  # share a source.
  #
  def +(other)
    raise ArgumentError,
      "Cannot concat something other than a slice to a slice." \
        unless other.respond_to?(:to_slice)

    raise Parslet::InvalidSliceOperation,
      "Cannot join slices that aren't adjacent."+
      " (#{self.inspect} + #{other.inspect})" \
        if offset+size != other.offset

    raise Parslet::InvalidSliceOperation, "Not from the same source." \
      if source != other.source

    # If both slices stem from the same bigger buffer, we can reslice that
    # buffer to (probably) avoid a buffer copy, as long as the strings are
    # not modified.
    if parent && parent == other.parent
      return parent.abs_slice(offset, size+other.size)
    end

    self.class.new(str + other.str, offset, source)
  end

  # Returns a <line, column> tuple referring to the original input.
  # Raises ArgumentError when the slice was created without a source.
  #
  def line_and_column
    raise ArgumentError, "No source was given, cannot infer line and column." \
      unless source

    source.line_and_column(self.offset)
  end


  # Conversion operators -----------------------------------------------------
  def to_str
    str
  end
  alias to_s to_str

  def to_slice
    self
  end

  def to_sym
    str.to_sym
  end

  # Strict conversion: Integer() raises ArgumentError for non-numeric content.
  def to_int
    Integer(str)
  end

  def to_i
    str.to_i
  end

  def to_f
    str.to_f
  end

  # Inspection & Debugging ---------------------------------------------------

  # Prints the slice as <code>"string"@offset</code>.
  def inspect
    str.inspect << "@#{offset}"
  end
end

# Raised when trying to do an operation on slices that cannot succeed, like
# adding non-adjacent slices. See Parslet::Slice.
#
class Parslet::InvalidSliceOperation < StandardError
end
|
data/lib/parslet/source.rb
CHANGED
@@ -1,120 +1,109 @@
|
|
1
1
|
|
2
2
|
require 'stringio'
|
3
3
|
|
4
|
+
require 'parslet/source/line_cache'
|
5
|
+
|
4
6
|
# Wraps the input IO to parslet. The interface defined by this class is
|
5
7
|
# smaller than what IO offers, but enhances it with a #column and #line
|
6
8
|
# method for the current position.
|
7
9
|
#
|
8
10
|
class Parslet::Source
|
9
|
-
attr_reader :line_ends
|
10
|
-
|
11
11
|
def initialize(io)
|
12
12
|
if io.respond_to? :to_str
|
13
13
|
io = StringIO.new(io)
|
14
14
|
end
|
15
15
|
|
16
16
|
@io = io
|
17
|
-
|
17
|
+
@virtual_position = @io.pos
|
18
|
+
@eof_position = nil
|
18
19
|
|
19
|
-
|
20
|
-
|
21
|
-
#
|
22
|
-
@
|
23
|
-
@line_ends.extend RangeSearch
|
20
|
+
@line_cache = LineCache.new
|
21
|
+
|
22
|
+
# Stores an array of <offset, buffer> tuples.
|
23
|
+
@slices = []
|
24
24
|
end
|
25
25
|
|
26
|
+
# Reads n chars from the input and returns a Parslet::Slice instance.
|
27
|
+
#
|
26
28
|
def read(n)
|
27
|
-
|
28
|
-
@
|
29
|
+
slice = read_from_cache(@virtual_position, n)
|
30
|
+
@virtual_position += slice.size
|
31
|
+
|
32
|
+
slice
|
29
33
|
end
|
30
34
|
|
31
35
|
def eof?
|
32
|
-
@
|
36
|
+
@eof_position && @virtual_position >= @eof_position
|
33
37
|
end
|
34
|
-
|
35
38
|
def pos
|
36
|
-
@
|
39
|
+
@virtual_position
|
37
40
|
end
|
38
|
-
|
39
|
-
# NOTE: If you seek beyond the point that you last read, you will get
|
40
|
-
# undefined behaviour. This is by design.
|
41
41
|
def pos=(new_pos)
|
42
|
-
@
|
43
|
-
end
|
44
|
-
|
45
|
-
def line_and_column(position=nil)
|
46
|
-
pos = (position || self.pos)
|
47
|
-
eol_idx = @line_ends.lbound(pos)
|
48
|
-
|
49
|
-
if eol_idx
|
50
|
-
# eol_idx points to the offset that ends the current line.
|
51
|
-
# Let's try to find the offset that starts it:
|
52
|
-
offset = eol_idx>0 && @line_ends[eol_idx-1] || 0
|
53
|
-
return [eol_idx+1, pos-offset+1]
|
54
|
-
else
|
55
|
-
# eol_idx is nil, that means that we're beyond the last line end that
|
56
|
-
# we know about. Pretend for now that we're just on the last line.
|
57
|
-
offset = @line_ends.last || 0
|
58
|
-
return [@line_ends.size+1, pos-offset+1]
|
59
|
-
end
|
42
|
+
@virtual_position = new_pos
|
60
43
|
end
|
61
44
|
|
62
|
-
#
|
63
|
-
#
|
64
|
-
#
|
65
|
-
# Example:
|
66
|
-
#
|
67
|
-
# [10, 20, 30]
|
68
|
-
# # would describe [0, 10], (10, 20], (20, 30]
|
45
|
+
# Returns a <line, column> tuple for the given position. If no position is
|
46
|
+
# given, line/column information is returned for the current position given
|
47
|
+
# by #pos.
|
69
48
|
#
|
70
|
-
|
71
|
-
|
72
|
-
# index of that number.
|
73
|
-
#
|
74
|
-
def lbound(bound)
|
75
|
-
return nil if empty?
|
76
|
-
return nil unless last > bound
|
77
|
-
|
78
|
-
left = 0
|
79
|
-
right = size - 1
|
80
|
-
|
81
|
-
n = 10
|
82
|
-
loop do
|
83
|
-
mid = left + (right - left) / 2
|
84
|
-
|
85
|
-
if self[mid] > bound
|
86
|
-
right = mid
|
87
|
-
else
|
88
|
-
# assert: self[mid] <= bound
|
89
|
-
left = mid+1
|
90
|
-
end
|
91
|
-
|
92
|
-
if right <= left
|
93
|
-
return right
|
94
|
-
end
|
95
|
-
end
|
96
|
-
end
|
49
|
+
def line_and_column(position=nil)
|
50
|
+
@line_cache.line_and_column(position || self.pos)
|
97
51
|
end
|
98
52
|
|
99
53
|
private
|
54
|
+
# Minimal size of a single read
|
55
|
+
MIN_READ_SIZE = 10 * 1024
|
56
|
+
# Number of slices to keep
|
57
|
+
BUFFER_CACHE_SIZE = 10
|
58
|
+
|
59
|
+
# Reads and returns a piece of the input that contains length chars starting
|
60
|
+
# at offset.
|
61
|
+
#
|
62
|
+
def read_from_cache(offset, length)
|
63
|
+
# Do we already have a buffer that contains the given range?
|
64
|
+
# Return that.
|
65
|
+
slice = @slices.find { |slice|
|
66
|
+
slice.satisfies?(offset, length) }
|
67
|
+
return slice.abs_slice(offset, length) if slice
|
68
|
+
|
69
|
+
# Read a new buffer: Can the demand be satisfied by sequentially reading
|
70
|
+
# from the current position?
|
71
|
+
needed = offset-@io.pos+length
|
72
|
+
if @io.pos <= offset && needed<MIN_READ_SIZE
|
73
|
+
# read the slice
|
74
|
+
slice = read_slice(needed)
|
75
|
+
return slice.abs_slice(offset, length)
|
76
|
+
end
|
77
|
+
|
78
|
+
# Otherwise seek and read enough so that we can satisfy the demand.
|
79
|
+
@io.pos = offset
|
100
80
|
|
101
|
-
|
102
|
-
return
|
103
|
-
|
104
|
-
cur = -1
|
81
|
+
slice = read_slice(needed)
|
82
|
+
return slice.abs_slice(offset, length)
|
83
|
+
end
|
105
84
|
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
85
|
+
def read_slice(needed)
|
86
|
+
start = @io.pos
|
87
|
+
request = [MIN_READ_SIZE, needed].max
|
88
|
+
buf = @io.read(request)
|
89
|
+
|
90
|
+
# remember eof position
|
91
|
+
if !buf || buf.size<request
|
92
|
+
@eof_position = @io.pos
|
111
93
|
end
|
94
|
+
|
95
|
+
# cache line ends
|
96
|
+
@line_cache.scan_for_line_endings(start, buf)
|
97
|
+
|
98
|
+
slice = Parslet::Slice.new(buf || '', start, self)
|
99
|
+
|
100
|
+
# Don't cache empty slices.
|
101
|
+
return slice unless buf
|
102
|
+
|
103
|
+
# cache the buffer (and eject old entries)
|
104
|
+
@slices << slice
|
105
|
+
@slices.shift if @slices.size > BUFFER_CACHE_SIZE
|
112
106
|
|
113
|
-
|
114
|
-
# in @line_ends.
|
115
|
-
while buf && cur = buf.index("\n", cur+1)
|
116
|
-
@last_line_end = (start_pos + cur+1)
|
117
|
-
@line_ends << @last_line_end
|
118
|
-
end
|
107
|
+
slice
|
119
108
|
end
|
120
109
|
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
class Parslet::Source
  # A cache for line start positions.
  #
  class LineCache # :nodoc:
    def initialize
      # Stores line endings as a simple position number. The first line always
      # starts at 0; numbers beyond the biggest entry are on any line > size,
      # but probably make a scan to that position necessary.
      @line_ends = []
      @line_ends.extend RangeSearch
    end

    # Returns a <line, column> tuple for the given input position. Both
    # values are 1-based.
    #
    def line_and_column(pos)
      eol_idx = @line_ends.lbound(pos)

      if eol_idx
        # eol_idx points to the offset that ends the current line.
        # Let's try to find the offset that starts it:
        offset = eol_idx>0 && @line_ends[eol_idx-1] || 0
        return [eol_idx+1, pos-offset+1]
      else
        # eol_idx is nil, that means that we're beyond the last line end that
        # we know about. Pretend for now that we're just on the last line.
        offset = @line_ends.last || 0
        return [@line_ends.size+1, pos-offset+1]
      end
    end

    # Records the positions of all line ends found in buf, where buf is the
    # piece of input that starts at start_pos. A nil buf is ignored.
    #
    def scan_for_line_endings(start_pos, buf)
      return unless buf

      cur = -1

      # If we have already read part or all of buf, we already know about
      # line ends in that portion. Skip it by moving cur (the search index)
      # forward, so the range from start_pos to the last known line end is
      # not searched again.
      if @last_line_end && start_pos < @last_line_end
        cur = @last_line_end - start_pos - 1
      end

      # Scan the string for line endings; store the positions of all endings
      # in @line_ends. The stored position is the offset just after "\n".
      while cur = buf.index("\n", cur+1)
        @last_line_end = (start_pos + cur+1)
        @line_ends << @last_line_end
      end
    end
  end

  # Mixin for arrays that implicitly give a number of ranges, where one range
  # begins where the other one ends.
  #
  # Example:
  #
  # [10, 20, 30]
  # # would describe [0, 10], (10, 20], (20, 30]
  #
  module RangeSearch # :nodoc:
    # Scans the array for the first number that is greater than bound.
    # Returns the index of that number, or nil if the array is empty or no
    # element is greater than bound. Assumes the array is sorted ascending.
    #
    def lbound(bound)
      return nil if empty?
      return nil unless last > bound

      left = 0
      right = size - 1

      # Binary search. Invariant: self[right] > bound, guaranteed initially
      # by the guard on last above.
      while left < right
        mid = left + (right - left) / 2

        if self[mid] > bound
          right = mid
        else
          # assert: self[mid] <= bound
          left = mid+1
        end
      end

      right
    end
  end
end
|