rpdf2txt 0.8.2
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +5 -0
- data/LICENCE +515 -0
- data/Manifest.txt +126 -0
- data/README.txt +30 -0
- data/Rakefile +24 -0
- data/bin/rpdf2txt +58 -0
- data/config.save +12 -0
- data/install.rb +1098 -0
- data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
- data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
- data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
- data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
- data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
- data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
- data/lib/rpdf2txt-rockit/grammar.rb +644 -0
- data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
- data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
- data/lib/rpdf2txt-rockit/indexable.rb +53 -0
- data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
- data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
- data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
- data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
- data/lib/rpdf2txt-rockit/profiler.rb +168 -0
- data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
- data/lib/rpdf2txt-rockit/rockit.rb +76 -0
- data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
- data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
- data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
- data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
- data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
- data/lib/rpdf2txt-rockit/token.rb +364 -0
- data/lib/rpdf2txt-rockit/version.rb +3 -0
- data/lib/rpdf2txt/attributesparser.rb +42 -0
- data/lib/rpdf2txt/cmapparser.rb +65 -0
- data/lib/rpdf2txt/data/_cmap.grammar +11 -0
- data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/cmap.grammar +11 -0
- data/lib/rpdf2txt/data/cmap.rb +37 -0
- data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/cmap_range.rb +43 -0
- data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
- data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
- data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
- data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
- data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
- data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
- data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
- data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
- data/lib/rpdf2txt/data/pdftext.grammar +102 -0
- data/lib/rpdf2txt/data/pdftext.rb +146 -0
- data/lib/rpdf2txt/default_handler.rb +352 -0
- data/lib/rpdf2txt/lzw.rb +69 -0
- data/lib/rpdf2txt/object.rb +1114 -0
- data/lib/rpdf2txt/parser.rb +169 -0
- data/lib/rpdf2txt/symbol.rb +408 -0
- data/lib/rpdf2txt/text.rb +182 -0
- data/lib/rpdf2txt/text_state.rb +434 -0
- data/lib/rpdf2txt/textparser.rb +42 -0
- data/test/data/3392_obj +0 -0
- data/test/data/397_decrypted +15 -0
- data/test/data/450_decrypted +153 -0
- data/test/data/450_obj +0 -0
- data/test/data/452_decrypted +125 -0
- data/test/data/454_decrypted +108 -0
- data/test/data/456_decrypted +106 -0
- data/test/data/458_decrypted +111 -0
- data/test/data/458_obj +0 -0
- data/test/data/460_decrypted +118 -0
- data/test/data/460_obj +0 -0
- data/test/data/463_decrypted +117 -0
- data/test/data/465_decrypted +107 -0
- data/test/data/465_obj +0 -0
- data/test/data/90_obj +0 -0
- data/test/data/90_obj_comp +1 -0
- data/test/data/decrypted +0 -0
- data/test/data/encrypt_obj +0 -0
- data/test/data/encrypt_string +0 -0
- data/test/data/encrypt_string_128bit +0 -0
- data/test/data/encrypted_object_stream.pdf +0 -0
- data/test/data/firststream +1 -0
- data/test/data/index.pdfobj +0 -0
- data/test/data/index_2bit.pdfobj +0 -0
- data/test/data/index_masked.pdfobj +0 -0
- data/test/data/indexed.pdfobj +0 -0
- data/test/data/indexed_2bit.pdfobj +0 -0
- data/test/data/indexed_masked.pdfobj +0 -0
- data/test/data/inline.png +0 -0
- data/test/data/logo.png +0 -0
- data/test/data/lzw.pdfobj +0 -0
- data/test/data/lzw_index.pdfobj +0 -0
- data/test/data/page_tree.pdf +148 -0
- data/test/data/pdf_20.png +0 -0
- data/test/data/pdf_21.png +0 -0
- data/test/data/pdf_22.png +0 -0
- data/test/data/pdf_50.png +0 -0
- data/test/data/png.pdfobj +0 -0
- data/test/data/space_bug_stream.txt +119 -0
- data/test/data/stream.txt +292 -0
- data/test/data/stream_kerning_bug.txt +13 -0
- data/test/data/stream_kerning_bug2.txt +6 -0
- data/test/data/test.pdf +0 -0
- data/test/data/test.txt +8 -0
- data/test/data/test_text.txt +42 -0
- data/test/data/working_obj +0 -0
- data/test/data/working_obj2 +0 -0
- data/test/mock.rb +149 -0
- data/test/suite.rb +30 -0
- data/test/test_pdf_object.rb +1802 -0
- data/test/test_pdf_parser.rb +1340 -0
- data/test/test_pdf_text.rb +789 -0
- data/test/test_space_bug_05_2004.rb +87 -0
- data/test/test_stream.rb +194 -0
- data/test/test_text_state.rb +315 -0
- data/usage-en.txt +112 -0
- data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
- data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
- metadata +220 -0
@@ -0,0 +1,401 @@
|
|
1
|
+
require 'rpdf2txt-rockit/base_extensions'
|
2
|
+
require 'rpdf2txt-rockit/graphviz_dot'
|
3
|
+
|
4
|
+
# A two-level auto-vivifying hash: reading a missing key on the first level
# creates a second-level hash on the fly. When a block is supplied it is used
# as the default-value factory for the inner DefaultInitHash; otherwise the
# inner level is a plain Hash.
class HashOfHash < DefaultInitHash
  def initialize(&init_block)
    super do
      init_block ? DefaultInitHash.new(&init_block) : Hash.new
    end
  end
end
|
15
|
+
|
16
|
+
# A single directed edge: +from+ -> +to+, optionally tagged with +info+.
GraphLink = Struct.new("GraphLink", :from, :to, :info)
class GraphLink
  # Render the link as "from-info->to"; the info segment is omitted when
  # info is nil/false.
  def inspect
    if info
      "#{from.inspect}-#{info.inspect}->#{to.inspect}"
    else
      "#{from.inspect}->#{to.inspect}"
    end
  end
end
|
23
|
+
|
24
|
+
# Raised when a traversal asks for an outgoing/incoming link whose +info+ tag
# does not exist on the given node.
#
# FIX: previously subclassed Exception directly, which escapes plain
# `rescue` clauses and is grouped with SignalException/SystemExit. Subclassing
# StandardError is the Ruby convention for application errors and is
# backward-compatible for every `rescue GraphTraversalException`.
class GraphTraversalException < StandardError
  attr_reader :node, :links, :link_info

  # node     -- the node the traversal was at
  # links    -- the links that were actually available from/to that node
  # linkInfo -- the info tag the traversal asked for but could not find
  def initialize(node, links, linkInfo)
    @node, @links, @link_info = node, links, linkInfo
    super(message)
  end

  def message
    "There is no link from #{@node.inspect} having info #{@link_info.inspect} (valid links are #{@links.inspect})"
  end
  alias inspect message
end
|
36
|
+
|
37
|
+
# Directed multigraph (several links may exist between the same node pair).
# This is a memory expensive variant that manages several additional
# information data structures (root/leaf flags, a per-pair link map) to cut
# down on processing when the graph has been built.
class DirectedGraph
  # Flat array of every GraphLink in the graph, in insertion order.
  attr_reader :links

  def initialize
    @link_map = HashOfHash.new {Array.new} # [from][to] -> array of links
    @links = Array.new # All links in one array
    @is_root = Hash.new # node -> true iff no incoming links seen so far
    @is_leaf = Hash.new # node -> true iff no outgoing links seen so far
  end

  # All nodes ever added (keys of the root-flag hash).
  def nodes
    @is_root.keys
  end

  # Register a node; a fresh node starts out as both root and leaf.
  def add_node(node)
    unless include_node?(node)
      @is_root[node] = @is_leaf[node] = true
    end
  end

  def root?(node)
    @is_root[node]
  end

  def leaf?(node)
    @is_leaf[node]
  end

  def include_node?(node)
    @is_root.has_key?(node)
  end

  # Array of links between the specific pair (may be empty).
  def links_from_to(from, to)
    @link_map[from][to]
  end

  # All outgoing links of +node+, flattened across all targets.
  def links_from(node)
    @link_map[node].map {|to, links| links}.flatten
  end

  # Targets of +node+ that have at least one actual link.
  def children(node)
    @link_map[node].keys.select {|k| @link_map[node][k].length > 0}
  end

  # (Forced) add link will always add link even if there are already links
  # between the nodes. Returns the new GraphLink.
  def add_link(from, to, informationOnLink = nil)
    add_link_nodes(from, to)
    link = GraphLink.new(from, to, informationOnLink)
    links_from_to(from, to).push link
    add_to_links(link)
    link
  end

  # Ensure both endpoints exist and update root/leaf bookkeeping:
  # +from+ can no longer be a leaf, +to+ can no longer be a root.
  def add_link_nodes(from, to)
    add_node(from)
    add_node(to)
    @is_leaf[from] = @is_root[to] = false
  end

  # Add link if not already linked; returns nil when a link already existed.
  def link_nodes(from, to, info = nil)
    links_from_to?(from, to) ? nil : add_link(from, to, info)
  end

  def links_from_to?(from, to)
    not links_from_to(from, to).empty?
  end
  alias linked? links_from_to?

  def add_to_links(link)
    @links.push link
  end
  private :add_to_links

  # Depth-first post-order walk of everything reachable from +node+.
  # NOTE(review): a fresh visited hash is created per direct child, so nodes
  # reachable through several children are yielded more than once — confirm
  # whether callers rely on this before changing it.
  def each_reachable_node_once_depth_first(node, inclusive = true, &block)
    children(node).each do |c|
      recurse_each_reachable_depth_first_visited(c, Hash.new, &block)
    end
    block.call(node) if inclusive
  end
  alias each_reachable_node each_reachable_node_once_depth_first

  def recurse_each_reachable_depth_first_visited(node, visited, &block)
    visited[node] = true
    children(node).each do |c|
      unless visited[c]
        recurse_each_reachable_depth_first_visited(c, visited, &block)
      end
    end
    block.call(node) # post-order: node is yielded after its descendants
  end

  # Pre-order variant: yields +node+ first, then walks each child subtree.
  def each_reachable_node_once_breadth_first(node, inclusive = true, &block)
    block.call(node) if inclusive
    children(node).each do |c|
      recurse_each_reachable_breadth_first_visited(c, Hash.new, &block)
    end
  end
  # NOTE(review): this alias repeats the depth-first target — it looks like a
  # copy-paste slip (one would expect ..._breadth_first here), but it is also
  # a no-op re-definition of the same alias, so behavior is unchanged.
  alias each_reachable_node each_reachable_node_once_depth_first

  def recurse_each_reachable_breadth_first_visited(node, visited, &block)
    visited[node] = true
    block.call(node) # pre-order: node yielded before its descendants
    children(node).each do |c|
      unless visited[c]
        recurse_each_reachable_breadth_first_visited(c, visited, &block)
      end
    end
  end

  # Nodes with no incoming links.
  def root_nodes
    @is_root.reject {|key,val| val == false}.keys
  end
  alias_method :roots, :root_nodes

  # Nodes with no outgoing links.
  def leaf_nodes
    @is_leaf.reject {|key,val| val == false}.keys
  end
  alias_method :leafs, :leaf_nodes

  def internal_node?(node)
    !root?(node) and !leaf?(node)
  end

  def internal_nodes
    nodes.reject {|n| root?(n) or leaf?(n)}
  end

  # DFS cycle probe sharing one visited hash across the whole search.
  def recurse_cyclic?(node, visited)
    visited[node] = true
    children(node).each do |c|
      return true if visited[c] || recurse_cyclic?(c, visited)
    end
    false
  end

  # True if any cycle is reachable from a root node.
  # NOTE(review): a graph that is one big cycle has no root nodes at all, so
  # such a graph would report false here — confirm intended semantics.
  def cyclic?
    visited = Hash.new
    root_nodes.each {|root| return true if recurse_cyclic?(root, visited)}
    false
  end

  def acyclic?
    not cyclic?
  end

  # Follow the (first) link out of +state+ whose info matches +linkInfo+.
  # Raises GraphTraversalException when no such link exists (the NoMethodError
  # from calling +to+ on nil is caught; `rescue Exception` is broader than
  # needed but kept as-is).
  def transition(state, linkInfo)
    link = links_from(state).detect {|l| l.info == linkInfo}
    begin
      link.to
    rescue Exception
      raise GraphTraversalException.new(state, links_from(state), linkInfo)
    end
  end

  # Follow a sequence of link infos starting at +fromState+; returns the
  # final node reached. The info array is cloned so the caller's copy is
  # not consumed.
  def traverse(fromState, alongLinksWithInfo = [])
    state, len = fromState, alongLinksWithInfo.length
    alongLinksWithInfo = alongLinksWithInfo.clone
    while len > 0
      state = transition(state, alongLinksWithInfo.shift)
      len -= 1
    end
    state
  end

  # Render the graph via DotGraphFormatter (see graphviz_dot.rb).
  def to_dot(nodeShaper = nil, nodeLabeler = nil, linkLabeler = nil)
    f = DotGraphFormatter.new(nodeShaper, nodeLabeler, linkLabeler)
    f.format(nodes, links)
  end

  def to_postscript_file(filename, nodeShaper = nil, nodeLabeler = nil,
                         linkLabeler = nil)
    to_dot(nodeShaper, nodeLabeler, linkLabeler).write_to_file(filename)
  end

  # Floyd-Warshall-style algorithm which should be O(n^3) where n is the
  # number of nodes. We can probably work a bit on the constant factors!
  # Returns a NEW DirectedGraph containing the reflexive transitive closure.
  def transitive_closure_floyd_warshal
    vertices = nodes
    tcg = DirectedGraph.new
    num_nodes = vertices.length

    # Direct links (plus a self-link for every node => reflexive closure)
    for k in (0...num_nodes)
      for s in (0...num_nodes)
        vk, vs = vertices[k], vertices[s]
        if vk == vs
          tcg.link_nodes(vk,vs)
        elsif linked?(vk, vs)
          tcg.link_nodes(vk,vs)
        end
      end
    end

    # Indirect links: vi -> vj whenever some vk bridges vi -> vk -> vj.
    # NOTE(review): checks linked? on the ORIGINAL graph only, i.e. a single
    # relaxation pass over one intermediate — confirm closure completeness
    # for chains longer than two links.
    for i in (0...num_nodes)
      for j in (0...num_nodes)
        for k in (0...num_nodes)
          vi, vj, vk = vertices[i], vertices[j], vertices[k]
          if not tcg.linked?(vi,vj)
            tcg.link_nodes(vi, vj) if linked?(vi,vk) and linked?(vk,vj)
          end
        end
      end
    end
    tcg
  end
  alias_method :transitive_closure, :transitive_closure_floyd_warshal
end
|
251
|
+
|
252
|
+
# Parallel propagation in directed acyclic graphs. Should be faster than
|
253
|
+
# traversing all links from each start node if the graph is dense so that
|
254
|
+
# many traversals can be merged.
|
255
|
+
# Parallel propagation in directed acyclic graphs. Should be faster than
# traversing all links from each start node if the graph is dense so that
# many traversals can be merged: the propagation block is invoked once per
# (parent, child) pair, level by level from the start nodes.
class DagPropagator
  # directedGraph    -- any object responding to children(node)
  # startNodes       -- the initial frontier
  # propagationBlock -- called as block.(parent, child) for every edge visited
  def initialize(directedGraph, startNodes, &propagationBlock)
    @graph = directedGraph
    @block = propagationBlock
    init_start_nodes(startNodes)
    @visited = Hash.new
  end

  def init_start_nodes(startNodes)
    @startnodes = startNodes
  end

  # Run one full propagation from the configured start nodes.
  def propagate
    @visited.clear
    propagate_recursive
  end

  # Level-by-level sweep: visit the current frontier, fire the block on each
  # outgoing edge, and collect unvisited children as the next frontier.
  # (Iterative formulation of the original tail recursion — identical
  # visiting order and identical @startnodes mutation.)
  def propagate_recursive
    loop do
      frontier = Array.new
      @startnodes.each do |parent|
        @visited[parent] = true
        @graph.children(parent).each do |child|
          @block.call(parent, child)
          unless @visited[child] or frontier.include?(child)
            frontier.push(child)
          end
        end
      end
      break if frontier.empty?
      @startnodes = frontier
    end
  end
end
|
288
|
+
|
289
|
+
# Directed graph with fast traversal from children to parents (back)
|
290
|
+
# Directed graph with fast traversal from children to parents (back links).
# Maintains, in addition to DirectedGraph's forward structures, a reverse
# link map and a per-node list of distinct incoming link infos.
class BackLinkedDirectedGraph < DirectedGraph
  def initialize(*args)
    super
    @back_link_map = HashOfHash.new {Array.new} # [to][from] -> array of links
    @incoming_links_info = DefaultInitHash.new {Array.new} # node -> unique infos
  end

  # Same contract as DirectedGraph#add_link, but also records the link in the
  # reverse map and remembers its info tag (once per distinct value) on +to+.
  def add_link(from, to, informationOnLink = nil)
    link = super
    links_to_from(to, from).push link
    if informationOnLink and
        !@incoming_links_info[to].include?(informationOnLink)
      @incoming_links_info[to].push informationOnLink
    end
    link
  end

  # Distinct info tags seen on links arriving at +node+.
  def incoming_links_info(node)
    @incoming_links_info[node]
  end

  # Reverse counterpart of DirectedGraph#transition: follow the (first)
  # INCOMING link whose info matches, returning its source node. A missing
  # link surfaces as nil.from, caught and re-raised as
  # GraphTraversalException (the `rescue Exception` is broader than needed
  # but kept as-is).
  def back_transition(node, backLinkInfo)
    link = links_to(node).detect {|l| l.info == backLinkInfo}
    begin
      link.from
    rescue Exception
      raise GraphTraversalException.new(node, links_to(node), backLinkInfo)
    end
  end

  # Walk backwards along a sequence of link infos, consuming them from the
  # END of the array (pop) — i.e. the info list is given in forward order.
  def back_traverse(state, alongLinksWithInfo = [])
    len = alongLinksWithInfo.length
    alongLinksWithInfo = alongLinksWithInfo.clone
    while len > 0
      state = back_transition(state, alongLinksWithInfo.pop)
      len -= 1
    end
    state
  end

  # All incoming links of +node+, flattened across all sources.
  def links_to(node)
    @back_link_map[node].map {|from, links| links}.flatten
  end

  protected

  # Array of links for the specific (to, from) pair in the reverse map.
  def links_to_from(to, from)
    @back_link_map[to][from]
  end
end
|
340
|
+
|
341
|
+
# Fill +masks+ so that masks[i] == 2**i for every i in start..stop and
# return the (possibly freshly created) array. Indices below +start+ are
# left untouched, which allows incremental extension of an existing table.
def calc_masks(start, stop, masks = Array.new)
  (start..stop).each { |i| masks[i] = 1 << i }
  masks
end
|
346
|
+
|
347
|
+
# Square boolean adjacency matrix over a fixed set of objects, with each row
# packed into a single (arbitrary-precision) Integer used as a bitset.
class BooleanMatrix
  def initialize(objects)
    @index, @objects, @matrix = Hash.new, objects, Array.new
    cnt = 0
    objects.each do |o|
      @index[o] = cnt
      @matrix[cnt] = 0 # Use Integers to represent the booleans
      cnt += 1
    end
    @num_obects = cnt
  end

  # Precomputed bit masks shared by all instances; extended on demand.
  @@masks_max = 1000
  @@masks = calc_masks(0, @@masks_max)

  # Return the single-bit mask 2**index, growing the shared table if needed.
  def mask(index)
    mask = @@masks[index]
    unless mask
      calc_masks(@@masks_max + 1, index, @@masks)
      @@masks_max = index # remember the new extent so we don't regrow
      mask = @@masks[index] # FIX: was @masks (nil ivar), raising NoMethodError
    end
    mask
  end

  # OR row index2 into row index1 (bitwise union of reachability rows).
  def or(index1, index2)
    @matrix[index1] |= @matrix[index2]
  end

  # Yield the position of every set bit in anInteger, lowest bit first.
  def indices(anInteger)
    index = 0
    while anInteger > 0
      # FIX: was `yeild(index) if anInteger & 1` — besides the typo, every
      # Integer (including 0) is truthy in Ruby, so it yielded ALL positions.
      yield(index) if (anInteger & 1) == 1
      anInteger >>= 1
      index += 1
    end
  end

  # Build a DirectedGraph with a link objects[i] -> objects[j] for every set
  # bit j in row i.
  def directed_graph
    dg = DirectedGraph.new # FIX: was misspelled `Directedgraph`
    @matrix.each_with_index do |v, i|
      indices(v) do |index|
        dg.link_nodes(@objects[i], @objects[index])
      end
    end
    dg
  end

  # NOTE(review): the loop bodies were never implemented in the original;
  # kept as a no-op skeleton so existing callers (if any) see no change.
  def transitive_closure
    for i in (0..@num_obects)
      for j in (0..@num_obects)

      end
    end
  end
end
|
@@ -0,0 +1,393 @@
|
|
1
|
+
require 'rpdf2txt-rockit/graphdrawing'
|
2
|
+
require 'rpdf2txt-rockit/version'
|
3
|
+
require 'rpdf2txt-rockit/sourcecode_dumpable'
|
4
|
+
require 'rpdf2txt-rockit/parsing_ambiguities'
|
5
|
+
|
6
|
+
# Raised by GeneralizedLrParser#parse when the input cannot be parsed.
# FIX: subclass StandardError instead of Exception so a plain `rescue`
# catches it; backward-compatible for every `rescue ParseException`.
class ParseException < StandardError; end
|
7
|
+
|
8
|
+
# Generalized LR Parsing class
|
9
|
+
#
|
10
|
+
# This is a modification of Jan Rekers and Eelco Vissers Generalized LR
|
11
|
+
# parsers which in turn are derived from the Tomita parsing algorithm. The
|
12
|
+
# main feature of these kinds of parsers is that aribtrary long lookahead is
|
13
|
+
# used (when needed) since a parser is forked off every time there is an
|
14
|
+
# ambiguity.
|
15
|
+
#
|
16
|
+
# This implementation assumes that the ambiguities (arising from lack of
|
17
|
+
# lookahead) are resolved later; it does not handle ambiguities arising
|
18
|
+
# from the grammar. However, it can easily be extended to return a parse tree
|
19
|
+
# forest with all possible parse trees if there is a need for that.
|
20
|
+
# Alternatively, the user can resolve ambiguities in the grammar by specifying
|
21
|
+
# production priorities.
|
22
|
+
#
|
23
|
+
# The modification I've done is so that multiple token streams from the lexer
|
24
|
+
# can be handled. This allows simpler specification of lexers while still
|
25
|
+
# leading to valid parses as long as the grammar is unambigous.
|
26
|
+
#
|
27
|
+
# The algorithm used is copyright (c) 2001 Robert Feldt.
|
28
|
+
#
|
29
|
+
class GeneralizedLrParser
  include SourceCodeDumpable

  # aParseTable -- table with actions/goto/productions/priorities
  # aLexer      -- optional; when absent a ForkingRegexpLexer is built from
  #                the table's token set (minus the synthetic :EOF token).
  def initialize(aParseTable, aLexer = nil)
    @parse_table = aParseTable
    # puts @parse_table.inspect
    if aLexer
      @lexer = aLexer
    else
      tokens = @parse_table.tokens.clone
      tokens.delete(:EOF)
      @lexer = ForkingRegexpLexer.new(tokens)
    end
  end

  # Header comment emitted at the top of generated parser source.
  def parser_src_header
    "# Parser for #{@parse_table.language}\n" +
    "# created by Rockit version #{rockit_version} on #{Time.new.inspect}\n" +
    "# Rockit is copyright (c) 2001 Robert Feldt, feldt@ce.chalmers.se\n" +
    "# and licensed under GPL\n" +
    "# but this parser is under LGPL\n"
  end

  # Dump this parser as Ruby source (parse table inlined under a unique
  # class-variable name derived from object_id).
  def to_src(assignToName = nil, nameHash = {})
    ptname = "@@parse_table" + self.object_id.inspect.gsub('-', '_')
    parser_src_header + @parse_table.to_src(ptname) + "\n" +
    assign_to(assignToName,
              new_of_my_type(as_code(ptname)))
  end

  # Parse aString and return its syntax tree, or raise ParseException /
  # AmbigousParseException. Classic GLR driver loop: for each round, every
  # active stack is given to the actor (which queues shifts and performs
  # reductions immediately), then all queued shifts happen at once.
  def parse(aString)
    @string_being_parsed = aString
    @stacks_to_act_on, @accepted_stacks, @stacks_to_shift = [], [], []
    @lexer.init(aString)
    start_state = @parse_table.start_state
    @active_stacks = [ParseStack.new(start_state, @lexer)]
    @cnt, @reducer_cnt = -1, 0
    while @active_stacks.length > 0
      # File.open("as#{@cnt+=1}.graph", "w") {|f| f.write parsestacks_as_dot_digraph(@active_stacks)}
      @stacks_to_shift.clear
      @stacks_to_act_on = @active_stacks.clone
      actor(@stacks_to_act_on.shift) while @stacks_to_act_on.length > 0
      shifter
    end
    if @accepted_stacks.length > 0
      tree = @accepted_stacks.first.links_to_stack_in_state?(start_state).tree
      check_and_report_ambiguity tree
      return tree
    else
      handle_parse_error
    end
  end

  protected

  # Raise AmbigousParseException if any AmbiguityNode survived to the final
  # tree (i.e. the ambiguity was never resolved by priorities).
  def check_and_report_ambiguity(tree)
    tree.each_node do |node|
      if node.kind_of?(AmbiguityNode)
        raise AmbigousParseException.new(@string_being_parsed, tree,
                                         *(node.ambigous_trees))
      end
    end
  end

  # Build a diagnostic message from the stacks saved by the last shifter
  # round and raise ParseException.
  def handle_parse_error
    if @last_active_stacks
      str = "No valid token found on stacks:\n"
      @last_active_stacks.each_with_index do |stack, i|
        tokens = stack.lexer.peek
        str += "stack #{i}: #{context(stack.lexer.position.char_position)}" +
          "in state #{stack.state}\n" +
          "the lexer returns tokens = #{tokens.inspect} (#{tokens.map{|t| t.token_type.to_s}.inspect})\n" +
          "and the valid tokens = #{@parse_table.valid_tokens(stack.state).inspect}\n"
      end
    else
      str = "String could not be parsed"
    end
    raise ParseException, str
  end

  # Pretty "arrow pointing at the error column" snippet for error messages.
  def context(position)
    line, startpos = get_line_with_position(@string_being_parsed, position)
    indent = (" " * (position - startpos + 1))
    "on line:\n '" + line.to_s + "'\n " +
      indent + "^\n " +
      indent + "|\n " +
      indent + "--- Parse error!!\n"
  end

  # Return [line_text, line_start_index] for the line containing +position+
  # by scanning outwards to the surrounding newlines.
  # NOTE(review): the `startpos > 1` bound looks like it should be > 0 /
  # >= 0 — the first character of the string may be clipped; kept as-is.
  def get_line_with_position(string, position)
    startpos = position
    len = string.length
    while string[startpos,1] != "\n" and startpos > 1
      startpos -= 1
    end
    endpos = position
    while string[endpos,1] != "\n" and endpos < len
      endpos += 1
    end
    return string[startpos+1...endpos], startpos+1
  end

  # Consult the parse table for every token the (possibly forking) lexer can
  # see next: queue SHIFTs for the shifter phase, perform REDUCEs now, and
  # record ACCEPTed stacks.
  def actor(stack)
    #puts "actor(#{stack.state}) @stacks_to_act_on = #{@stacks_to_act_on.map{|s| s.state}.inspect}, @active_stacks = #{@active_stacks.map{|s| s.state}.inspect}"
    tokens = stack.lexer.peek
    #print "tokens = #{tokens.inspect}, "
    tokens.each do |token|
      #print "state = #{stack.state.inspect}, "
      actions = @parse_table.actions(stack.state, token.token_type)
      next unless actions
      #puts "lexer = #{stack.lexer.inspect} (#{stack.lexer.id}), token = #{token.inspect}, actions = #{actions.inspect}"
      actions.each do |action|
        case action[0]
        when :SHIFT
          @stacks_to_shift.push [stack, action[1], token]
        when :REDUCE
          do_reductions(stack, action[1], stack.lexer)
        when :ACCEPT
          @accepted_stacks.push stack
        end
      end
    end
  end

  # Pop |production| links off the stack graph (all alternative paths) and
  # run the reducer on each path's end. When the production participates in
  # a priority conflict, only paths that do not conflict are considered and
  # conflicting links are pruned from the stacks.
  def do_reductions(stack, productionNumber, lexer, pathsInludingLink = nil)
    production = @parse_table.production(productionNumber)
    #puts @parse_table.priorities.inspect
    paths =
      if @parse_table.priorities.in_some_conflict?(production)
        # Only return valid paths, ie. paths without conflicts. Prune the
        # invalid paths.
        stack.valid_paths_of_length_with_pruning(production.length,
                                                 pathsInludingLink) do |l,p|
          not @parse_table.priorities.conflict?(l.production, p, production)
        end
      else
        stack.paths_of_length(production.length, pathsInludingLink)
      end
    paths.each do |path|
      reducer(path.to, @parse_table.goto(path.to.state, productionNumber),
              lexer, production, trees_on_path(path), productionNumber)
    end
  end

  # Apply one reduction: build the production's tree from the child trees,
  # then either merge into an existing stack with the same (state, lexer)
  # pair — detecting ambiguity when a parallel link already exists with a
  # different tree — or push a brand-new stack.
  def reducer(stack, newState, newLexer, production, childTrees, productionNum)
    # puts "#{@reducer_cnt+=1}: reducer(#{stack.state}, #{production.inspect})\n"
    tree = production.create_tree(childTrees)

    # ywesee: remember the raw source slice this tree was reduced from
    if(tree)
      range = stack.pointer..newLexer.scanner.pointer
      tree.raw_src = (@lexer.scanner.string[range] || '').strip
    end

    existing_stack = @active_stacks.detect do |as|
      as.state==newState and as.lexer==newLexer
    end
    if existing_stack
      # There is already a stack with the same state and lexer, so instead
      # of creating a new stack we re-use the existing one.
      if (existing_link = existing_stack.links_to_stack?(stack))
        # There is already a link to the stack => Ambiguity unless existing
        # tree same as new tree. The latter can happen when multiple token
        # types that all match the current string are in the follow set of
        # the same production.
        if tree != existing_link.tree
          handle_ambiguity(existing_stack, existing_link, tree, production)
        end
      else
        new_link = existing_stack.add_link(stack, tree, production)
        # Stacks already acted on may now have new reductions through the
        # freshly added link.
        recheck_stacks(@active_stacks - @stacks_to_act_on, new_link)
      end
    else
      new_stack = ParseStack.new(newState, newLexer)
      new_stack.add_link(stack, tree, production)
      @stacks_to_act_on.push new_stack
      @active_stacks.push new_stack
    end
    # File.open("as#{@cnt}_#{@reducer_cnt}.graph", "w") {|f| f.write parsestacks_as_dot_digraph(@active_stacks)}
  end

  # Re-run REDUCE actions on already-processed stacks, restricted to paths
  # that include the newly added link.
  def recheck_stacks(stacks, link)
    # Recheck stacks to see if new reductions are possible including the new
    # link
    stacks.each do |stack|
      actions(stack).each do |action|
        if action[0] == :REDUCE
          do_reductions(stack, action[1], stack.lexer, link)
        end
      end
    end
  end

  # All parse-table actions applicable to +stack+ across every token its
  # lexer can see next.
  def actions(stack)
    actions, state = [], stack.state
    stack.lexer.peek.each do |token|
      actions.concat @parse_table.actions(state, token.token_type)
    end
    actions
  end

  # Record an ambiguity on the existing link by wrapping/extending an
  # AmbiguityNode; resolution (or an exception) happens later in
  # check_and_report_ambiguity.
  def handle_ambiguity(existingStack, existingLink, newTree, production)
    # We can extend the parser here to return the full parse forest.
    # For now we simplify things and raise an exception.
    existing_tree = existingLink.tree
    if existing_tree.kind_of?(AmbiguityNode)
      existing_tree.add_ambigoustree(newTree)
    else
      existingLink.tree = AmbiguityNode.new(existing_tree, newTree)
    end
    #alternatives = [newTree.compact!]
    #existingStack.links.each {|link| alternatives.push link.tree.compact!}
    #raise AmbigousParseException.new(@string_being_parsed, *alternatives)
  end

  # Debug helper (disabled); kept for reference.
  def write_graphs
    #File.open("atree1.graph", "w") {|f| f.write syntaxtree_as_dot_digraph(newTree)}
    #File.open("atree2.graph", "w") {|f| f.write syntaxtree_as_dot_digraph(existingStack.links[0].tree)}
    #File.open("as_at_ambiguity.graph", "w") {|f| f.write parsestacks_as_dot_digraph(@active_stacks)}
  end

  # Perform all queued shifts, merging shifts that land on the same
  # (state, lexer) pair into one stack with multiple links.
  def shifter
    # Save current active_stacks if there is a parse error
    @last_active_stacks = @active_stacks.clone
    @active_stacks.clear
    @stacks_to_shift.each do |stack, newstate, lexertoken|
      tree = lexertoken.create_tree
      existing_stack = @active_stacks.detect do |as|
        as.state==newstate and as.lexer==lexertoken.lexer
      end
      if existing_stack
        existing_stack.add_link(stack, tree)
      else
        new_stack = ParseStack.new(newstate, lexertoken.lexer)
        new_stack.add_link(stack, tree)
        @active_stacks.push new_stack
      end
    end
  end

  # Child trees along a path, in left-to-right (reduction) order — the path
  # itself is stored newest-first, hence the reverse.
  def trees_on_path(path)
    path.links.reverse.map {|link| link.tree}
  end
end
|
273
|
+
|
274
|
+
# One node in the graph-structured stack (GSS) of the GLR parser. A stack
# holds a parser state, the lexer whose position it corresponds to, and
# links back to predecessor stacks (each link carrying the subtree built
# over the consumed input).
class ParseStack
  Link = Struct.new("Link", :stack, :tree, :production)
  class Link
    def inspect
      "-#{tree.inspect}->#{stack.state.inspect}"
    end
  end

  attr_reader :state, :lexer, :links
  attr_reader :pointer # ywesee: scanner position captured at first link-in

  def initialize(aState, aLexer)
    @state, @lexer = aState, aLexer
    @links = Array.new
  end

  # ywesee: write-once setter — only the FIRST assigned value sticks, so the
  # pointer keeps the earliest scanner position this stack was linked at.
  def pointer=(value)
    @pointer ||= value
  end

  # Link this stack back to aParseStack, recording the subtree (and the
  # production for reductions; nil for shifts). Returns the new Link.
  def add_link(aParseStack, aTree, aProduction = nil)
    aParseStack.pointer = @lexer.scanner.pointer # ywesee
    @links.push(l = Link.new(aParseStack, aTree, aProduction))
    l
  end

  # First link whose predecessor stack is in +state+ (or nil).
  def links_to_stack_in_state?(state)
    @links.detect {|link| link.stack.state == state}
  end

  # First link pointing at exactly +stack+ (or nil).
  def links_to_stack?(stack)
    @links.detect {|link| link.stack == stack}
  end

  # All backward paths of exactly +length+ links starting here. When aLink
  # is given, only paths containing that link are returned.
  # NOTE(review): when a branch bottoms out before +length+ links the short
  # path is still pushed — confirm callers never ask for more links than any
  # branch has, or whether short paths are intended.
  def paths_of_length(length, aLink = nil)
    return [] if length == 0
    paths = Array.new
    @links.each do |link|
      child_paths = link.stack.paths_of_length(length-1)
      if child_paths.length > 0
        child_paths.each {|cpath| paths.push(StackPath.new(self, [link]).add_path(cpath))}
      else
        paths.push StackPath.new(self, [link])
      end
    end
    delete_paths_without_link(paths, aLink)
  end

  # Like paths_of_length, but each link must satisfy validity_checker(link,
  # remaining_length). DESTRUCTIVE: links that lead to no valid path are
  # pruned from @links of every stack visited (this is the conflict pruning
  # used by production priorities).
  def valid_paths_of_length_with_pruning(length, aLink = nil,
                                         &validity_checker)
    return [] if length == 0
    paths, new_links = Array.new, Array.new
    @links.each do |link|
      if validity_checker.call(link, length-1)
        if length == 1
          new_links.push(link)
          paths.push(StackPath.new(self, [link]))
        else
          child_paths =
            link.stack.valid_paths_of_length_with_pruning(length-1,
                                                          &validity_checker)
          if child_paths.length > 0
            new_links.push(link)
            child_paths.each do |cpath|
              paths.push(StackPath.new(self, [link]).add_path(cpath))
            end
          end
        end
      end
    end
    @links = new_links # prune links that produced no valid path
    delete_paths_without_link(paths, aLink)
  end

  def inspect
    "PSt(#{@state}, #{@links.inspect}, #{@lexer.inspect})"
  end

  private

  # Filter helper: keep only paths containing aLink (no filtering when nil).
  def delete_paths_without_link(paths, aLink)
    if aLink
      return paths.find_all {|path| path.includes_link?(aLink)}
    else
      return paths
    end
  end
end
|
363
|
+
|
364
|
+
# An ordered chain of links walked backwards through the parse-stack graph.
# +from+ is the stack the path starts at; +to+ is the stack its last link
# points to; +links+ are the traversed Link records in walk order.
class StackPath
  attr_reader :to, :from
  attr_reader :links

  def initialize(from, links = [])
    @from = from
    @links = Array.new
    links.each { |l| add_link(l) }
  end

  # Append one link; the path now ends at that link's predecessor stack.
  def add_link(link)
    @links << link
    @to = link.stack
  end

  # Splice another path's links onto the end of this one; returns self so
  # calls can be chained.
  def add_path(aStackPath)
    aStackPath.links.each { |l| @links << l }
    @to = @links.last.stack
    self
  end

  # Truthy when +link+ occurs anywhere on the path.
  def includes_link?(link)
    @links.detect { |l| l == link }
  end

  def inspect
    "#{from.state}(#{from.lexer.position.char_position}) " +
      @links.map {|l| "-#{l.tree.inspect}-> #{l.stack.state} (#{l.stack.lexer.position.char_position}) "}.join
  end
end
|
393
|
+
|