parslet 1.1.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/HISTORY.txt +29 -0
- data/README +2 -4
- data/Rakefile +18 -4
- data/example/comments.rb +11 -13
- data/example/documentation.rb +1 -1
- data/example/email_parser.rb +5 -5
- data/example/empty.rb +2 -2
- data/example/erb.rb +6 -3
- data/example/ip_address.rb +2 -2
- data/example/local.rb +34 -0
- data/example/minilisp.rb +2 -2
- data/example/output/comments.out +8 -0
- data/example/output/documentation.err +4 -0
- data/example/output/documentation.out +1 -0
- data/example/output/email_parser.out +2 -0
- data/example/output/empty.err +1 -0
- data/example/output/erb.out +7 -0
- data/example/output/ip_address.out +9 -0
- data/example/output/local.out +3 -0
- data/example/output/minilisp.out +5 -0
- data/example/output/parens.out +8 -0
- data/example/output/readme.out +1 -0
- data/example/output/seasons.out +28 -0
- data/example/output/simple_xml.out +2 -0
- data/example/output/string_parser.out +3 -0
- data/example/parens.rb +1 -3
- data/example/readme.rb +4 -10
- data/example/seasons.rb +2 -1
- data/example/simple_xml.rb +5 -8
- data/example/string_parser.rb +7 -5
- data/lib/parslet.rb +20 -31
- data/lib/parslet/atoms.rb +1 -0
- data/lib/parslet/atoms/base.rb +46 -87
- data/lib/parslet/atoms/dsl.rb +98 -0
- data/lib/parslet/atoms/entity.rb +3 -4
- data/lib/parslet/atoms/lookahead.rb +1 -1
- data/lib/parslet/atoms/re.rb +2 -2
- data/lib/parslet/atoms/str.rb +5 -2
- data/lib/parslet/atoms/transform.rb +75 -0
- data/lib/parslet/atoms/visitor.rb +9 -9
- data/lib/parslet/convenience.rb +3 -3
- data/lib/parslet/export.rb +13 -13
- data/lib/parslet/expression/treetop.rb +2 -2
- data/lib/parslet/parser.rb +55 -1
- data/lib/parslet/rig/rspec.rb +36 -10
- data/lib/parslet/slice.rb +172 -0
- data/lib/parslet/source.rb +72 -83
- data/lib/parslet/source/line_cache.rb +90 -0
- metadata +22 -20
@@ -0,0 +1,172 @@
|
|
1
|
+
|
2
|
+
# A slice is a small part from the parse input. A slice mainly behaves like
|
3
|
+
# any other string, except that it remembers where it came from (offset in
|
4
|
+
# original input).
|
5
|
+
#
|
6
|
+
# Some slices also know what parent slice they are a small part of. This
|
7
|
+
# allows the slice to be concatenated to other slices from the same buffer by
|
8
|
+
# reslicing it against that original buffer.
|
9
|
+
#
|
10
|
+
# Why the complexity? Slices allow retaining offset information. This will
|
11
|
+
# allow us to assign line and column to each small bit of output from the parslet
|
12
|
+
# parser. Also, while we keep that information, we might as well try to do
|
13
|
+
# something useful with it. Reslicing the same buffers should in theory keep
|
14
|
+
# buffer copies and allocations down.
|
15
|
+
#
|
16
|
+
# == Extracting line and column
|
17
|
+
#
|
18
|
+
# Using the #line_and_column method, you can extract the line and column in
|
19
|
+
# the original input where this slice starts.
|
20
|
+
#
|
21
|
+
# Example:
|
22
|
+
# slice.line_and_column # => [1, 13]
|
23
|
+
# slice.offset # => 12
|
24
|
+
#
|
25
|
+
# == Likeness to strings
|
26
|
+
#
|
27
|
+
# Parslet::Slice behaves in many ways like a Ruby String. This likeness
|
28
|
+
# however is not complete - many of the myriad of operations String supports
|
29
|
+
# are not yet in Slice. You can always extract the internal string instance by
|
30
|
+
# calling #to_s.
|
31
|
+
#
|
32
|
+
# These omissions are somewhat intentional. Rather than maintaining a full
|
33
|
+
# delegation, we opt for a partial emulation that gets the job done.
|
34
|
+
#
|
35
|
+
# Note also that there are some things that work with strings that will never
|
36
|
+
# work when using slices. For instance, you cannot concatenate slices that
|
37
|
+
# aren't from the same source or that don't join up:
|
38
|
+
#
|
39
|
+
# Example:
|
40
|
+
# big_slice = 'abcdef'
|
41
|
+
# a = big_slice.slice(0, 2) # => "ab"@0
|
42
|
+
# b = big_slice.slice(4, 2) # => "ef"@4
|
43
|
+
#
|
44
|
+
# a + b # raises Parslet::InvalidSliceOperation
|
45
|
+
#
|
46
|
+
# This avoids creating slices with impossible offsets or that are
|
47
|
+
# discontinuous.
|
48
|
+
#
|
49
|
+
class Parslet::Slice
  attr_reader :str, :offset
  attr_reader :parent
  attr_reader :source

  # Creates a new slice.
  #
  # string:: the characters held by this slice
  # offset:: position of the first character of string in the original input
  # source:: (optional) object that answers #line_and_column(offset)
  # parent:: (optional) the bigger slice this slice was cut out of
  #
  def initialize(string, offset, source=nil, parent=nil)
    @str, @offset = string, offset
    @source = source
    @parent = parent
  end

  # Compares slices to other slices or strings. Only the string content is
  # compared; the offset plays no part in the comparison.
  #
  def ==(other)
    str == other
  end

  # Match regular expressions.
  #
  def match(regexp)
    str.match(regexp)
  end

  # Returns a slice that starts at offset start (relative to the beginning of
  # this slice) and that has length characters. Whenever possible, return
  # parts of the parent buffer that this slice was cut out of.
  #
  def slice(start, length)
    # NOTE: At a later stage, we might not want to create huge trees of slices.
    # The fact that the root of the tree creates slices that link to it makes
    # the tree already rather flat.

    if parent
      # Translate start into the parent's coordinate system. Leaving start
      # out here would always return the beginning of this slice.
      parent.slice(start + offset - parent.offset, length)
    else
      self.class.new(str.slice(start, length), offset + start, source, self)
    end
  end

  # Returns a slice that starts at file offset start and that has length
  # characters in it.
  #
  def abs_slice(start, length)
    slice(start - offset, length)
  end

  # True if this slice can satisfy an original input request to the
  # range ofs, len.
  #
  def satisfies?(ofs, len)
    ofs >= offset && (ofs - offset + len - 1) < str.size
  end

  # Number of characters in this slice.
  #
  def size
    str.size
  end

  # Joins this slice with the slice that directly follows it in the input and
  # returns the combined slice.
  #
  # Raises ArgumentError if other does not respond to #to_slice, and
  # Parslet::InvalidSliceOperation if the slices are not adjacent or do not
  # share the same source.
  #
  def +(other)
    raise ArgumentError,
      "Cannot concat something other than a slice to a slice." \
        unless other.respond_to?(:to_slice)

    raise Parslet::InvalidSliceOperation,
      "Cannot join slices that aren't adjacent."+
      " (#{self.inspect} + #{other.inspect})" \
        if offset + size != other.offset

    raise Parslet::InvalidSliceOperation, "Not from the same source." \
      if source != other.source

    # If both slices stem from the same bigger buffer, we can reslice that
    # buffer to (probably) avoid a buffer copy, as long as the strings are
    # not modified. The parents must be the very same object: Slice#== only
    # compares string content, so two distinct buffers that happen to hold
    # the same text must not be mistaken for one another.
    if parent && parent.equal?(other.parent)
      return parent.abs_slice(offset, size + other.size)
    end

    self.class.new(str + other.str, offset, source)
  end

  # Returns a <line, column> tuple referring to the original input.
  #
  # Raises ArgumentError if the slice was created without a source.
  #
  def line_and_column
    raise ArgumentError, "No source was given, cannot infer line and column." \
      unless source

    source.line_and_column(self.offset)
  end


  # Conversion operators -----------------------------------------------------

  # Returns the string held by this slice (the internal instance, not a copy).
  def to_str
    str
  end
  alias to_s to_str

  def to_slice
    self
  end

  def to_sym
    str.to_sym
  end

  # Strict integer conversion: goes through Kernel#Integer, which raises
  # ArgumentError for content that is not a valid integer literal.
  def to_int
    Integer(str)
  end

  # Lenient integer conversion (String#to_i).
  def to_i
    str.to_i
  end

  def to_f
    str.to_f
  end

  # Inspection & Debugging ---------------------------------------------------

  # Prints the slice as <code>"string"@offset</code>.
  def inspect
    str.inspect << "@#{offset}"
  end
end
|
167
|
+
|
168
|
+
# Raised when trying to do an operation on slices that cannot succeed, like
|
169
|
+
# adding non-adjacent slices. See Parslet::Slice.
|
170
|
+
#
|
171
|
+
# Error type for slice operations that cannot succeed.
class Parslet::InvalidSliceOperation < StandardError; end
|
data/lib/parslet/source.rb
CHANGED
@@ -1,120 +1,109 @@
|
|
1
1
|
|
2
2
|
require 'stringio'
|
3
3
|
|
4
|
+
require 'parslet/source/line_cache'
|
5
|
+
|
4
6
|
# Wraps the input IO to parslet. The interface defined by this class is
|
5
7
|
# smaller than what IO offers, but enhances it with a #column and #line
|
6
8
|
# method for the current position.
|
7
9
|
#
|
8
10
|
class Parslet::Source
|
9
|
-
attr_reader :line_ends
|
10
|
-
|
11
11
|
def initialize(io)
|
12
12
|
if io.respond_to? :to_str
|
13
13
|
io = StringIO.new(io)
|
14
14
|
end
|
15
15
|
|
16
16
|
@io = io
|
17
|
-
|
17
|
+
@virtual_position = @io.pos
|
18
|
+
@eof_position = nil
|
18
19
|
|
19
|
-
|
20
|
-
|
21
|
-
#
|
22
|
-
@
|
23
|
-
@line_ends.extend RangeSearch
|
20
|
+
@line_cache = LineCache.new
|
21
|
+
|
22
|
+
# Caches recently read buffers as Parslet::Slice instances (each knows its offset).
|
23
|
+
@slices = []
|
24
24
|
end
|
25
25
|
|
26
|
+
# Reads n chars from the input and returns a Parslet::Slice instance.
|
27
|
+
#
|
26
28
|
def read(n)
|
27
|
-
|
28
|
-
@
|
29
|
+
slice = read_from_cache(@virtual_position, n)
|
30
|
+
@virtual_position += slice.size
|
31
|
+
|
32
|
+
slice
|
29
33
|
end
|
30
34
|
|
31
35
|
def eof?
|
32
|
-
@
|
36
|
+
@eof_position && @virtual_position >= @eof_position
|
33
37
|
end
|
34
|
-
|
35
38
|
def pos
|
36
|
-
@
|
39
|
+
@virtual_position
|
37
40
|
end
|
38
|
-
|
39
|
-
# NOTE: If you seek beyond the point that you last read, you will get
|
40
|
-
# undefined behaviour. This is by design.
|
41
41
|
def pos=(new_pos)
|
42
|
-
@
|
43
|
-
end
|
44
|
-
|
45
|
-
def line_and_column(position=nil)
|
46
|
-
pos = (position || self.pos)
|
47
|
-
eol_idx = @line_ends.lbound(pos)
|
48
|
-
|
49
|
-
if eol_idx
|
50
|
-
# eol_idx points to the offset that ends the current line.
|
51
|
-
# Let's try to find the offset that starts it:
|
52
|
-
offset = eol_idx>0 && @line_ends[eol_idx-1] || 0
|
53
|
-
return [eol_idx+1, pos-offset+1]
|
54
|
-
else
|
55
|
-
# eol_idx is nil, that means that we're beyond the last line end that
|
56
|
-
# we know about. Pretend for now that we're just on the last line.
|
57
|
-
offset = @line_ends.last || 0
|
58
|
-
return [@line_ends.size+1, pos-offset+1]
|
59
|
-
end
|
42
|
+
@virtual_position = new_pos
|
60
43
|
end
|
61
44
|
|
62
|
-
#
|
63
|
-
#
|
64
|
-
#
|
65
|
-
# Example:
|
66
|
-
#
|
67
|
-
# [10, 20, 30]
|
68
|
-
# # would describe [0, 10], (10, 20], (20, 30]
|
45
|
+
# Returns a <line, column> tuple for the given position. If no position is
|
46
|
+
# given, line/column information is returned for the current position given
|
47
|
+
# by #pos.
|
69
48
|
#
|
70
|
-
|
71
|
-
|
72
|
-
# index of that number.
|
73
|
-
#
|
74
|
-
def lbound(bound)
|
75
|
-
return nil if empty?
|
76
|
-
return nil unless last > bound
|
77
|
-
|
78
|
-
left = 0
|
79
|
-
right = size - 1
|
80
|
-
|
81
|
-
n = 10
|
82
|
-
loop do
|
83
|
-
mid = left + (right - left) / 2
|
84
|
-
|
85
|
-
if self[mid] > bound
|
86
|
-
right = mid
|
87
|
-
else
|
88
|
-
# assert: self[mid] <= bound
|
89
|
-
left = mid+1
|
90
|
-
end
|
91
|
-
|
92
|
-
if right <= left
|
93
|
-
return right
|
94
|
-
end
|
95
|
-
end
|
96
|
-
end
|
49
|
+
def line_and_column(position=nil)
|
50
|
+
@line_cache.line_and_column(position || self.pos)
|
97
51
|
end
|
98
52
|
|
99
53
|
private
|
54
|
+
# Minimal size of a single read
|
55
|
+
MIN_READ_SIZE = 10 * 1024
|
56
|
+
# Number of slices to keep
|
57
|
+
BUFFER_CACHE_SIZE = 10
|
58
|
+
|
59
|
+
# Reads and returns a piece of the input that contains length chars starting
|
60
|
+
# at offset.
|
61
|
+
#
|
62
|
+
def read_from_cache(offset, length)
|
63
|
+
# Do we already have a buffer that contains the given range?
|
64
|
+
# Return that.
|
65
|
+
slice = @slices.find { |slice|
|
66
|
+
slice.satisfies?(offset, length) }
|
67
|
+
return slice.abs_slice(offset, length) if slice
|
68
|
+
|
69
|
+
# Read a new buffer: Can the demand be satisfied by sequentially reading
|
70
|
+
# from the current position?
|
71
|
+
needed = offset-@io.pos+length
|
72
|
+
if @io.pos <= offset && needed<MIN_READ_SIZE
|
73
|
+
# read the slice
|
74
|
+
slice = read_slice(needed)
|
75
|
+
return slice.abs_slice(offset, length)
|
76
|
+
end
|
77
|
+
|
78
|
+
# Otherwise seek and read enough so that we can satisfy the demand.
|
79
|
+
@io.pos = offset
|
100
80
|
|
101
|
-
|
102
|
-
return
|
103
|
-
|
104
|
-
cur = -1
|
81
|
+
slice = read_slice(needed)
|
82
|
+
return slice.abs_slice(offset, length)
|
83
|
+
end
|
105
84
|
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
85
|
+
def read_slice(needed)
|
86
|
+
start = @io.pos
|
87
|
+
request = [MIN_READ_SIZE, needed].max
|
88
|
+
buf = @io.read(request)
|
89
|
+
|
90
|
+
# remember eof position
|
91
|
+
if !buf || buf.size<request
|
92
|
+
@eof_position = @io.pos
|
111
93
|
end
|
94
|
+
|
95
|
+
# cache line ends
|
96
|
+
@line_cache.scan_for_line_endings(start, buf)
|
97
|
+
|
98
|
+
slice = Parslet::Slice.new(buf || '', start, self)
|
99
|
+
|
100
|
+
# Don't cache empty slices.
|
101
|
+
return slice unless buf
|
102
|
+
|
103
|
+
# cache the buffer (and eject old entries)
|
104
|
+
@slices << slice
|
105
|
+
@slices.shift if @slices.size > BUFFER_CACHE_SIZE
|
112
106
|
|
113
|
-
|
114
|
-
# in @line_ends.
|
115
|
-
while buf && cur = buf.index("\n", cur+1)
|
116
|
-
@last_line_end = (start_pos + cur+1)
|
117
|
-
@line_ends << @last_line_end
|
118
|
-
end
|
107
|
+
slice
|
119
108
|
end
|
120
109
|
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
class Parslet::Source
  # A cache for line start positions.
  #
  class LineCache # :nodoc:
    def initialize
      # Stores line endings as a simple position number. The first line always
      # starts at 0; numbers beyond the biggest entry are on any line > size,
      # but probably make a scan to that position necessary.
      @line_ends = []
      @line_ends.extend RangeSearch

      # Position just after the last newline recorded so far (nil until the
      # first newline has been seen).
      @last_line_end = nil
    end

    # Returns a <line, column> tuple for the given input position. Both line
    # and column are 1-based.
    #
    def line_and_column(pos)
      eol_idx = @line_ends.lbound(pos)

      if eol_idx
        # eol_idx points to the offset that ends the current line.
        # Let's try to find the offset that starts it:
        offset = eol_idx > 0 ? @line_ends[eol_idx - 1] : 0
        [eol_idx + 1, pos - offset + 1]
      else
        # eol_idx is nil, that means that we're beyond the last line end that
        # we know about. Pretend for now that we're just on the last line.
        offset = @line_ends.last || 0
        [@line_ends.size + 1, pos - offset + 1]
      end
    end

    # Records the line endings found in buf, a chunk of input that starts at
    # input position start_pos. For every newline, the position just after it
    # is stored. A nil buf or a buf without newlines is ignored.
    #
    def scan_for_line_endings(start_pos, buf)
      return unless buf
      return unless buf.index("\n")
      cur = -1

      # If we have already read part or all of buf, we already know about
      # line ends in that portion. remove it and correct cur (search index)
      if @last_line_end && start_pos < @last_line_end
        # Let's not search the range from start_pos to last_line_end again.
        cur = @last_line_end - start_pos - 1
      end

      # Scan the string for line endings; store the positions of all endings
      # in @line_ends.
      while (cur = buf.index("\n", cur + 1))
        @last_line_end = start_pos + cur + 1
        @line_ends << @last_line_end
      end
    end
  end

  # Mixin for arrays that implicitly give a number of ranges, where one range
  # begins where the other one ends.
  #
  #   Example:
  #
  #     [10, 20, 30]
  #     # would describe [0, 10], (10, 20], (20, 30]
  #
  module RangeSearch # :nodoc:
    # Scans the array for the first number that is > than bound. Returns the
    # index of that number, or nil if there is no such number. The array is
    # searched by bisection and is therefore expected to be sorted ascending.
    #
    def lbound(bound)
      return nil if empty?
      return nil unless last > bound

      left = 0
      right = size - 1

      # Invariant: self[right] > bound. Narrow [left, right] until both meet
      # on the first element that is greater than bound.
      while left < right
        mid = left + (right - left) / 2

        if self[mid] > bound
          right = mid
        else
          # assert: self[mid] <= bound
          left = mid + 1
        end
      end

      right
    end
  end
end
|