rpdf2txt 0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +5 -0
- data/LICENCE +515 -0
- data/Manifest.txt +126 -0
- data/README.txt +30 -0
- data/Rakefile +24 -0
- data/bin/rpdf2txt +58 -0
- data/config.save +12 -0
- data/install.rb +1098 -0
- data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
- data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
- data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
- data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
- data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
- data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
- data/lib/rpdf2txt-rockit/grammar.rb +644 -0
- data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
- data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
- data/lib/rpdf2txt-rockit/indexable.rb +53 -0
- data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
- data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
- data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
- data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
- data/lib/rpdf2txt-rockit/profiler.rb +168 -0
- data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
- data/lib/rpdf2txt-rockit/rockit.rb +76 -0
- data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
- data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
- data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
- data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
- data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
- data/lib/rpdf2txt-rockit/token.rb +364 -0
- data/lib/rpdf2txt-rockit/version.rb +3 -0
- data/lib/rpdf2txt/attributesparser.rb +42 -0
- data/lib/rpdf2txt/cmapparser.rb +65 -0
- data/lib/rpdf2txt/data/_cmap.grammar +11 -0
- data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/cmap.grammar +11 -0
- data/lib/rpdf2txt/data/cmap.rb +37 -0
- data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/cmap_range.rb +43 -0
- data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
- data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
- data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
- data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
- data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
- data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
- data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
- data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
- data/lib/rpdf2txt/data/pdftext.grammar +102 -0
- data/lib/rpdf2txt/data/pdftext.rb +146 -0
- data/lib/rpdf2txt/default_handler.rb +352 -0
- data/lib/rpdf2txt/lzw.rb +69 -0
- data/lib/rpdf2txt/object.rb +1114 -0
- data/lib/rpdf2txt/parser.rb +169 -0
- data/lib/rpdf2txt/symbol.rb +408 -0
- data/lib/rpdf2txt/text.rb +182 -0
- data/lib/rpdf2txt/text_state.rb +434 -0
- data/lib/rpdf2txt/textparser.rb +42 -0
- data/test/data/3392_obj +0 -0
- data/test/data/397_decrypted +15 -0
- data/test/data/450_decrypted +153 -0
- data/test/data/450_obj +0 -0
- data/test/data/452_decrypted +125 -0
- data/test/data/454_decrypted +108 -0
- data/test/data/456_decrypted +106 -0
- data/test/data/458_decrypted +111 -0
- data/test/data/458_obj +0 -0
- data/test/data/460_decrypted +118 -0
- data/test/data/460_obj +0 -0
- data/test/data/463_decrypted +117 -0
- data/test/data/465_decrypted +107 -0
- data/test/data/465_obj +0 -0
- data/test/data/90_obj +0 -0
- data/test/data/90_obj_comp +1 -0
- data/test/data/decrypted +0 -0
- data/test/data/encrypt_obj +0 -0
- data/test/data/encrypt_string +0 -0
- data/test/data/encrypt_string_128bit +0 -0
- data/test/data/encrypted_object_stream.pdf +0 -0
- data/test/data/firststream +1 -0
- data/test/data/index.pdfobj +0 -0
- data/test/data/index_2bit.pdfobj +0 -0
- data/test/data/index_masked.pdfobj +0 -0
- data/test/data/indexed.pdfobj +0 -0
- data/test/data/indexed_2bit.pdfobj +0 -0
- data/test/data/indexed_masked.pdfobj +0 -0
- data/test/data/inline.png +0 -0
- data/test/data/logo.png +0 -0
- data/test/data/lzw.pdfobj +0 -0
- data/test/data/lzw_index.pdfobj +0 -0
- data/test/data/page_tree.pdf +148 -0
- data/test/data/pdf_20.png +0 -0
- data/test/data/pdf_21.png +0 -0
- data/test/data/pdf_22.png +0 -0
- data/test/data/pdf_50.png +0 -0
- data/test/data/png.pdfobj +0 -0
- data/test/data/space_bug_stream.txt +119 -0
- data/test/data/stream.txt +292 -0
- data/test/data/stream_kerning_bug.txt +13 -0
- data/test/data/stream_kerning_bug2.txt +6 -0
- data/test/data/test.pdf +0 -0
- data/test/data/test.txt +8 -0
- data/test/data/test_text.txt +42 -0
- data/test/data/working_obj +0 -0
- data/test/data/working_obj2 +0 -0
- data/test/mock.rb +149 -0
- data/test/suite.rb +30 -0
- data/test/test_pdf_object.rb +1802 -0
- data/test/test_pdf_parser.rb +1340 -0
- data/test/test_pdf_text.rb +789 -0
- data/test/test_space_bug_05_2004.rb +87 -0
- data/test/test_stream.rb +194 -0
- data/test/test_text_state.rb +315 -0
- data/usage-en.txt +112 -0
- data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
- data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
- metadata +220 -0
|
@@ -0,0 +1,401 @@
|
|
|
1
|
+
require 'rpdf2txt-rockit/base_extensions'
|
|
2
|
+
require 'rpdf2txt-rockit/graphviz_dot'
|
|
3
|
+
|
|
4
|
+
# Two-level auto-vivifying hash so that h[a][b] works without any setup.
# When an init block is supplied the inner hashes are DefaultInitHash'es
# sharing that block; otherwise they are plain Hashes.
class HashOfHash < DefaultInitHash
  def initialize(&initBlock)
    super do
      initBlock ? DefaultInitHash.new(&initBlock) : Hash.new
    end
  end
end
|
|
15
|
+
|
|
16
|
+
# A directed edge from +from+ to +to+, optionally annotated with +info+.
GraphLink = Struct.new("GraphLink", :from, :to, :info)
class GraphLink
  # Render as "from-info->to"; the info part is omitted when info is absent.
  def inspect
    annotation = info ? "#{info.inspect}-" : ""
    "#{from.inspect}-#{annotation}>#{to.inspect}"
  end
end
|
|
23
|
+
|
|
24
|
+
# Raised when a graph traversal asks for a transition that does not exist
# from the current node.
class GraphTraversalException < Exception
  attr_reader :node, :links, :link_info

  def initialize(node, links, linkInfo)
    @node = node
    @links = links
    @link_info = linkInfo
    super(message)
  end

  # Human-readable description of the failed transition.
  def message
    "There is no link from #{@node.inspect} having info #{@link_info.inspect} (valid links are #{@links.inspect})"
  end
  alias inspect message
end
|
|
36
|
+
|
|
37
|
+
# Directed graph that keeps several redundant indexes (link map plus
# root/leaf flags) so queries are cheap once the graph has been built,
# at the price of extra memory.
class DirectedGraph
  # Every link ever added, in insertion order.
  attr_reader :links

  def initialize
    @link_map = HashOfHash.new { Array.new } # [from][to] -> Array of links
    @links    = Array.new                    # all links in one array
    @is_root  = Hash.new                     # node -> true iff no incoming links
    @is_leaf  = Hash.new                     # node -> true iff no outgoing links
  end

  # Every node that has been added (each node has an @is_root entry).
  def nodes
    @is_root.keys
  end

  # Register +node+; a fresh node starts out as both root and leaf.
  def add_node(node)
    return if include_node?(node)
    @is_root[node] = @is_leaf[node] = true
  end

  def root?(node)
    @is_root[node]
  end

  def leaf?(node)
    @is_leaf[node]
  end

  def include_node?(node)
    @is_root.has_key?(node)
  end

  # Array of links between the two nodes (auto-vivified; may be empty).
  def links_from_to(from, to)
    @link_map[from][to]
  end

  # All outgoing links of +node+.
  def links_from(node)
    @link_map[node].map { |_to, node_links| node_links }.flatten
  end

  # Nodes reachable from +node+ through at least one link.
  def children(node)
    @link_map[node].keys.select { |to| !@link_map[node][to].empty? }
  end

  # (Forced) add link: always appends, even when the nodes are already
  # linked.  Returns the new GraphLink.
  def add_link(from, to, informationOnLink = nil)
    add_link_nodes(from, to)
    link = GraphLink.new(from, to, informationOnLink)
    links_from_to(from, to).push link
    add_to_links(link)
    link
  end

  # Ensure both endpoints exist and update their root/leaf status.
  def add_link_nodes(from, to)
    add_node(from)
    add_node(to)
    @is_leaf[from] = @is_root[to] = false
  end

  # Add a link only when the nodes are not yet linked; returns nil otherwise.
  def link_nodes(from, to, info = nil)
    add_link(from, to, info) unless links_from_to?(from, to)
  end

  def links_from_to?(from, to)
    !links_from_to(from, to).empty?
  end
  alias linked? links_from_to?

  def add_to_links(link)
    @links.push link
  end
  private :add_to_links

  # Yield every node reachable from +node+, children before parents;
  # +node+ itself is yielded last when +inclusive+ is true.
  # NOTE(review): a fresh visited-hash is used per child subtree, so a node
  # shared between two subtrees may be yielded more than once — kept as-is
  # to preserve behavior.
  def each_reachable_node_once_depth_first(node, inclusive = true, &block)
    children(node).each do |child|
      recurse_each_reachable_depth_first_visited(child, Hash.new, &block)
    end
    block.call(node) if inclusive
  end
  alias each_reachable_node each_reachable_node_once_depth_first

  def recurse_each_reachable_depth_first_visited(node, visited, &block)
    visited[node] = true
    children(node).each do |child|
      recurse_each_reachable_depth_first_visited(child, visited, &block) unless visited[child]
    end
    block.call(node)
  end

  # Breadth-first flavour: each node is yielded before its children.
  def each_reachable_node_once_breadth_first(node, inclusive = true, &block)
    block.call(node) if inclusive
    children(node).each do |child|
      recurse_each_reachable_breadth_first_visited(child, Hash.new, &block)
    end
  end
  # NOTE(review): this alias repeats the depth-first binding above; it was
  # probably meant to point at the breadth-first method.  Kept as-is to
  # preserve behavior.
  alias each_reachable_node each_reachable_node_once_depth_first

  def recurse_each_reachable_breadth_first_visited(node, visited, &block)
    visited[node] = true
    block.call(node)
    children(node).each do |child|
      recurse_each_reachable_breadth_first_visited(child, visited, &block) unless visited[child]
    end
  end

  # Nodes with no incoming links.
  def root_nodes
    @is_root.reject { |_node, flag| flag == false }.keys
  end
  alias_method :roots, :root_nodes

  # Nodes with no outgoing links.
  def leaf_nodes
    @is_leaf.reject { |_node, flag| flag == false }.keys
  end
  alias_method :leafs, :leaf_nodes

  def internal_node?(node)
    !root?(node) && !leaf?(node)
  end

  def internal_nodes
    nodes.reject { |n| root?(n) || leaf?(n) }
  end

  def recurse_cyclic?(node, visited)
    visited[node] = true
    children(node).any? { |child| visited[child] || recurse_cyclic?(child, visited) }
  end

  # True when a cycle is reachable from some root node.
  # NOTE(review): a graph where every node has an incoming link has no
  # roots, so its cycles go undetected here; also the visited hash is
  # shared across roots.  Both quirks kept to preserve behavior.
  def cyclic?
    visited = Hash.new
    root_nodes.any? { |root| recurse_cyclic?(root, visited) }
  end

  def acyclic?
    !cyclic?
  end

  # Follow the link out of +state+ whose info matches +linkInfo+.
  # Raises GraphTraversalException when no such link exists.
  def transition(state, linkInfo)
    link = links_from(state).detect { |l| l.info == linkInfo }
    if link
      link.to
    else
      raise GraphTraversalException.new(state, links_from(state), linkInfo)
    end
  end

  # Repeatedly apply #transition along the given sequence of link infos
  # and return the final state.  The argument array is not mutated.
  def traverse(fromState, alongLinksWithInfo = [])
    alongLinksWithInfo.inject(fromState) { |state, info| transition(state, info) }
  end

  def to_dot(nodeShaper = nil, nodeLabeler = nil, linkLabeler = nil)
    formatter = DotGraphFormatter.new(nodeShaper, nodeLabeler, linkLabeler)
    formatter.format(nodes, links)
  end

  def to_postscript_file(filename, nodeShaper = nil, nodeLabeler = nil,
                         linkLabeler = nil)
    to_dot(nodeShaper, nodeLabeler, linkLabeler).write_to_file(filename)
  end

  # Transitive closure, named after the Floyd–Warshall algorithm; O(n^3)
  # in the number of nodes.
  # NOTE(review): the indirect pass consults only the ORIGINAL graph's
  # links, so this actually computes reachability over paths of length <= 2
  # rather than the full closure.  Kept as-is to preserve behavior.
  def transitive_closure_floyd_warshal
    vertices = nodes
    tcg = DirectedGraph.new

    # Self-loops and direct links.
    vertices.each do |vk|
      vertices.each do |vs|
        tcg.link_nodes(vk, vs) if vk == vs || linked?(vk, vs)
      end
    end

    # Indirect (two-step) links.
    vertices.each do |vi|
      vertices.each do |vj|
        next if tcg.linked?(vi, vj)
        vertices.each do |vk|
          if linked?(vi, vk) && linked?(vk, vj)
            tcg.link_nodes(vi, vj)
            break
          end
        end
      end
    end
    tcg
  end
  alias_method :transitive_closure, :transitive_closure_floyd_warshal
end
|
|
251
|
+
|
|
252
|
+
# Parallel propagation in directed acyclic graphs. Should be faster than
|
|
253
|
+
# traversing all links from each start node if the graph is dense so that
|
|
254
|
+
# many traversals can be merged.
|
|
255
|
+
# Level-by-level propagation through a directed acyclic graph.  Starting
# from the given nodes, the block is invoked once for every (parent, child)
# link encountered on the frontier.  When the graph is dense this merges
# many traversals that separate per-start-node walks would repeat.
class DagPropagator
  def initialize(directedGraph, startNodes, &propagationBlock)
    @graph = directedGraph
    @block = propagationBlock
    init_start_nodes(startNodes)
    @visited = Hash.new
  end

  def init_start_nodes(startNodes)
    @startnodes = startNodes
  end

  # Run the propagation from the configured start nodes.
  def propagate
    @visited.clear
    propagate_recursive
  end

  def propagate_recursive
    frontier = Array.new
    @startnodes.each do |parent|
      @visited[parent] = true
      @graph.children(parent).each do |child|
        @block.call(parent, child)
        frontier.push(child) unless @visited[child] || frontier.include?(child)
      end
    end
    unless frontier.empty?
      @startnodes = frontier
      propagate_recursive
    end
  end
end
|
|
288
|
+
|
|
289
|
+
# Directed graph with fast traversal from children to parents (back)
|
|
290
|
+
# DirectedGraph that additionally indexes links by their TARGET node, so
# walking from children back to parents is fast.
class BackLinkedDirectedGraph < DirectedGraph
  def initialize(*args)
    super
    @back_link_map = HashOfHash.new { Array.new }            # [to][from] -> Array of links
    @incoming_links_info = DefaultInitHash.new { Array.new } # node -> unique link infos
  end

  def add_link(from, to, informationOnLink = nil)
    link = super
    links_to_from(to, from).push link
    infos = @incoming_links_info[to]
    infos.push(informationOnLink) if informationOnLink && !infos.include?(informationOnLink)
    link
  end

  # Infos seen on links arriving at +node+ (each recorded at most once).
  def incoming_links_info(node)
    @incoming_links_info[node]
  end

  # Follow the incoming link of +node+ whose info matches +backLinkInfo+
  # back to its source; raises GraphTraversalException when absent.
  def back_transition(node, backLinkInfo)
    link = links_to(node).detect { |l| l.info == backLinkInfo }
    if link
      link.from
    else
      raise GraphTraversalException.new(node, links_to(node), backLinkInfo)
    end
  end

  # Walk backwards, consuming the given link infos last-first (the original
  # popped from the end of a cloned array).  The argument is not mutated.
  def back_traverse(state, alongLinksWithInfo = [])
    alongLinksWithInfo.reverse_each do |info|
      state = back_transition(state, info)
    end
    state
  end

  # All incoming links of +node+.
  def links_to(node)
    @back_link_map[node].map { |_from, node_links| node_links }.flatten
  end

  protected

  def links_to_from(to, from)
    @back_link_map[to][from]
  end
end
|
|
340
|
+
|
|
341
|
+
# Fill +masks+ so that masks[i] == (1 << i) for every i in start..stop,
# and return the (possibly pre-existing) array.
def calc_masks(start, stop, masks = Array.new)
  (start..stop).each { |i| masks[i] = 1 << i }
  masks
end
|
|
346
|
+
|
|
347
|
+
# Square boolean matrix over a set of objects, with each row packed into a
# single arbitrary-precision Integer so row combinations are cheap bitwise
# operations.
class BooleanMatrix
  # Map each object to a row index and start every row as the empty bit set.
  def initialize(objects)
    @index, @objects, @matrix = Hash.new, objects, Array.new
    count = 0
    objects.each do |object|
      @index[object] = count
      @matrix[count] = 0 # Integers used as bit sets
      count += 1
    end
    @num_objects = count
  end

  # Shared cache of single-bit masks: @@masks[i] == 1 << i.
  @@masks_max = 1000
  @@masks = calc_masks(0, @@masks_max)

  # Return 1 << index, growing the shared mask cache on demand.
  def mask(index)
    mask = @@masks[index]
    unless mask
      calc_masks(@@masks_max + 1, index, @@masks)
      # BUG FIX: the original read back from an unset instance variable
      # (@masks) and never advanced @@masks_max, recomputing the overlap on
      # every later growth.
      @@masks_max = index if index > @@masks_max
      mask = @@masks[index]
    end
    mask
  end

  # OR row +index2+ into row +index1+.
  def or(index1, index2)
    @matrix[index1] |= @matrix[index2]
  end

  # Yield the index of every set bit in +anInteger+, lowest bit first.
  def indices(anInteger)
    index = 0
    while anInteger > 0
      # BUG FIX: 'yeild' was a typo (NameError at runtime), and
      # 'anInteger & 1' is always truthy in Ruby (0 is truthy), so the
      # guard never filtered unset bits as intended.
      yield(index) if (anInteger & 1) == 1
      anInteger >>= 1
      index += 1
    end
  end

  # Build a DirectedGraph with a link objects[i] -> objects[j] for every
  # set bit (i, j) of the matrix.
  def directed_graph
    # BUG FIX: the original wrote 'Directedgraph', a NameError.
    dg = DirectedGraph.new
    @matrix.each_with_index do |row, i|
      indices(row) do |j|
        dg.link_nodes(@objects[i], @objects[j])
      end
    end
    dg
  end

  # NOTE(review): unimplemented in the original (nested loops with empty
  # bodies); kept as an explicit no-op to preserve behavior.
  def transitive_closure
    # intentionally a no-op
  end
end
|
|
@@ -0,0 +1,393 @@
|
|
|
1
|
+
require 'rpdf2txt-rockit/graphdrawing'
|
|
2
|
+
require 'rpdf2txt-rockit/version'
|
|
3
|
+
require 'rpdf2txt-rockit/sourcecode_dumpable'
|
|
4
|
+
require 'rpdf2txt-rockit/parsing_ambiguities'
|
|
5
|
+
|
|
6
|
+
# Raised by GeneralizedLrParser when the input string cannot be parsed.
class ParseException < Exception
end
|
|
7
|
+
|
|
8
|
+
# Generalized LR Parsing class
|
|
9
|
+
#
|
|
10
|
+
# This is a modification of Jan Rekers and Eelco Vissers Generalized LR
|
|
11
|
+
# parsers which in turn are derived from the Tomita parsing algorithm. The
|
|
12
|
+
# main feature of these kinds of parsers is that aribtrary long lookahead is
|
|
13
|
+
# used (when needed) since a parser is forked off every time there is an
|
|
14
|
+
# ambiguity.
|
|
15
|
+
#
|
|
16
|
+
# This implementation assumes that the ambiguities (arising from lack of
|
|
17
|
+
# lookahead) are resolved later; it does not handle ambiguities arising
|
|
18
|
+
# from the grammar. However, it can easily be extended to return a parse tree
|
|
19
|
+
# forest with all possible parse trees if there is a need for that.
|
|
20
|
+
# Alternatively, the user can resolve ambiguities in the grammar by specifying
|
|
21
|
+
# production priorities.
|
|
22
|
+
#
|
|
23
|
+
# The modification I've done is so that multiple token streams from the lexer
|
|
24
|
+
# can be handled. This allows simpler specification of lexers while still
|
|
25
|
+
# leading to valid parses as long as the grammar is unambigous.
|
|
26
|
+
#
|
|
27
|
+
# The algorithm used is copyright (c) 2001 Robert Feldt.
|
|
28
|
+
#
|
|
29
|
+
# Generalized LR (GLR / Tomita-style) parser.
#
# Derived from Jan Rekers' and Eelco Visser's Generalized LR parsers,
# which in turn build on the Tomita parsing algorithm.  The defining
# feature is arbitrarily long lookahead: whenever the table is ambiguous
# the parser forks, and forks that turn out invalid simply die off.
#
# Ambiguities caused by limited lookahead are resolved as parsing
# proceeds; true grammar ambiguities raise AmbigousParseException (the
# parser could be extended to return a full parse forest instead).  Users
# may also resolve grammar ambiguities up front via production priorities.
#
# This variant additionally handles multiple token streams from the lexer,
# which allows simpler lexer specifications while still yielding valid
# parses as long as the grammar is unambiguous.
#
# The algorithm used is copyright (c) 2001 Robert Feldt.
class GeneralizedLrParser
  include SourceCodeDumpable

  def initialize(aParseTable, aLexer = nil)
    @parse_table = aParseTable
    if aLexer
      @lexer = aLexer
    else
      # Default lexer: forking regexp lexer over every token except EOF.
      tokens = @parse_table.tokens.clone
      tokens.delete(:EOF)
      @lexer = ForkingRegexpLexer.new(tokens)
    end
  end

  # Header comment emitted at the top of generated parser source.
  def parser_src_header
    "# Parser for #{@parse_table.language}\n" +
    "# created by Rockit version #{rockit_version} on #{Time.new.inspect}\n" +
    "# Rockit is copyright (c) 2001 Robert Feldt, feldt@ce.chalmers.se\n" +
    "# and licensed under GPL\n" +
    "# but this parser is under LGPL\n"
  end

  # Dump this parser (and its table) as Ruby source code.
  def to_src(assignToName = nil, nameHash = {})
    table_name = "@@parse_table" + self.object_id.inspect.gsub('-', '_')
    parser_src_header + @parse_table.to_src(table_name) + "\n" +
      assign_to(assignToName, new_of_my_type(as_code(table_name)))
  end

  # Parse +aString+ and return its syntax tree.  Raises ParseException for
  # invalid input and AmbigousParseException for ambiguous parses.
  def parse(aString)
    @string_being_parsed = aString
    @stacks_to_act_on, @accepted_stacks, @stacks_to_shift = [], [], []
    @lexer.init(aString)
    start_state = @parse_table.start_state
    @active_stacks = [ParseStack.new(start_state, @lexer)]
    @cnt, @reducer_cnt = -1, 0
    until @active_stacks.empty?
      @stacks_to_shift.clear
      @stacks_to_act_on = @active_stacks.clone
      actor(@stacks_to_act_on.shift) until @stacks_to_act_on.empty?
      shifter
    end
    if @accepted_stacks.empty?
      handle_parse_error
    else
      tree = @accepted_stacks.first.links_to_stack_in_state?(start_state).tree
      check_and_report_ambiguity tree
      tree
    end
  end

  protected

  # Raise if the finished tree still contains ambiguity nodes.
  def check_and_report_ambiguity(tree)
    tree.each_node do |node|
      if node.kind_of?(AmbiguityNode)
        raise AmbigousParseException.new(@string_being_parsed, tree,
                                         *(node.ambigous_trees))
      end
    end
  end

  # Build a descriptive ParseException from the stacks that were active
  # when no stack could make progress.
  def handle_parse_error
    if @last_active_stacks
      str = "No valid token found on stacks:\n"
      @last_active_stacks.each_with_index do |stack, i|
        tokens = stack.lexer.peek
        str += "stack #{i}: #{context(stack.lexer.position.char_position)}" +
               "in state #{stack.state}\n" +
               "the lexer returns tokens = #{tokens.inspect} (#{tokens.map{|t| t.token_type.to_s}.inspect})\n" +
               "and the valid tokens = #{@parse_table.valid_tokens(stack.state).inspect}\n"
      end
    else
      str = "String could not be parsed"
    end
    raise ParseException, str
  end

  # Render the offending line with a caret pointing at +position+.
  def context(position)
    line, line_start = get_line_with_position(@string_being_parsed, position)
    indent = " " * (position - line_start + 1)
    "on line:\n '" + line.to_s + "'\n " +
      indent + "^\n " +
      indent + "|\n " +
      indent + "--- Parse error!!\n"
  end

  # Return [line containing +position+, index of that line's first char].
  def get_line_with_position(string, position)
    line_start = position
    # NOTE(review): the lower bound is 1, not 0, as in the original.
    line_start -= 1 while string[line_start, 1] != "\n" && line_start > 1
    line_end = position
    line_end += 1 while string[line_end, 1] != "\n" && line_end < string.length
    return string[line_start + 1...line_end], line_start + 1
  end

  # Let +stack+ act on every token its lexer offers: queue shifts, perform
  # reductions immediately, record accepts.
  def actor(stack)
    stack.lexer.peek.each do |token|
      actions = @parse_table.actions(stack.state, token.token_type)
      next unless actions
      actions.each do |action|
        case action[0]
        when :SHIFT
          @stacks_to_shift.push [stack, action[1], token]
        when :REDUCE
          do_reductions(stack, action[1], stack.lexer)
        when :ACCEPT
          @accepted_stacks.push stack
        end
      end
    end
  end

  # Reduce by the given production along every applicable stack path.
  def do_reductions(stack, productionNumber, lexer, pathsInludingLink = nil)
    production = @parse_table.production(productionNumber)
    paths =
      if @parse_table.priorities.in_some_conflict?(production)
        # Only keep conflict-free paths; prune the invalid ones.
        stack.valid_paths_of_length_with_pruning(production.length,
                                                 pathsInludingLink) do |l, p|
          not @parse_table.priorities.conflict?(l.production, p, production)
        end
      else
        stack.paths_of_length(production.length, pathsInludingLink)
      end
    paths.each do |path|
      reducer(path.to, @parse_table.goto(path.to.state, productionNumber),
              lexer, production, trees_on_path(path), productionNumber)
    end
  end

  # Apply one reduction: build the tree, then merge into an existing stack
  # with the same state and lexer or create a new one.
  def reducer(stack, newState, newLexer, production, childTrees, productionNum)
    tree = production.create_tree(childTrees)

    # ywesee: remember the raw source slice this tree was built from.
    if tree
      range = stack.pointer..newLexer.scanner.pointer
      tree.raw_src = (@lexer.scanner.string[range] || '').strip
    end

    existing_stack = @active_stacks.detect do |as|
      as.state == newState && as.lexer == newLexer
    end
    if existing_stack
      # Re-use the stack that already has this state and lexer.
      existing_link = existing_stack.links_to_stack?(stack)
      if existing_link
        # A link already exists => ambiguity, unless the trees coincide.
        # The latter happens when several token types that all match the
        # current string share the follow set of one production.
        if tree != existing_link.tree
          handle_ambiguity(existing_stack, existing_link, tree, production)
        end
      else
        new_link = existing_stack.add_link(stack, tree, production)
        recheck_stacks(@active_stacks - @stacks_to_act_on, new_link)
      end
    else
      new_stack = ParseStack.new(newState, newLexer)
      new_stack.add_link(stack, tree, production)
      @stacks_to_act_on.push new_stack
      @active_stacks.push new_stack
    end
  end

  # A new link was added: stacks that already acted may now permit new
  # reductions through it.
  def recheck_stacks(stacks, link)
    stacks.each do |stack|
      actions(stack).each do |action|
        do_reductions(stack, action[1], stack.lexer, link) if action[0] == :REDUCE
      end
    end
  end

  # All parse-table actions available to +stack+ for any peeked token.
  def actions(stack)
    result, state = [], stack.state
    stack.lexer.peek.each do |token|
      result.concat @parse_table.actions(state, token.token_type)
    end
    result
  end

  # Fold a second parse of the same span into an AmbiguityNode; could be
  # extended to return the full parse forest instead of failing later.
  def handle_ambiguity(existingStack, existingLink, newTree, production)
    existing_tree = existingLink.tree
    if existing_tree.kind_of?(AmbiguityNode)
      existing_tree.add_ambigoustree(newTree)
    else
      existingLink.tree = AmbiguityNode.new(existing_tree, newTree)
    end
  end

  # Debugging hook (intentionally empty; see source history).
  def write_graphs
  end

  # Shift every queued stack, merging stacks that land in the same state
  # with the same lexer.
  def shifter
    # Keep the current stacks around for error reporting.
    @last_active_stacks = @active_stacks.clone
    @active_stacks.clear
    @stacks_to_shift.each do |stack, newstate, lexertoken|
      tree = lexertoken.create_tree
      existing_stack = @active_stacks.detect do |as|
        as.state == newstate && as.lexer == lexertoken.lexer
      end
      if existing_stack
        existing_stack.add_link(stack, tree)
      else
        new_stack = ParseStack.new(newstate, lexertoken.lexer)
        new_stack.add_link(stack, tree)
        @active_stacks.push new_stack
      end
    end
  end

  # Trees along +path+, oldest first.
  def trees_on_path(path)
    path.links.reverse.map { |link| link.tree }
  end
end
|
|
273
|
+
|
|
274
|
+
# One node of the graph-structured parse stack used by GeneralizedLrParser.
class ParseStack
  # Edge back to an older stack node, carrying the tree built for the
  # symbols in between and the production that built it (nil for shifts).
  Link = Struct.new("Link", :stack, :tree, :production)
  class Link
    def inspect
      "-#{tree.inspect}->#{stack.state.inspect}"
    end
  end

  attr_reader :state, :lexer, :links
  attr_reader :pointer # ywesee: scanner position recorded at first linking

  def initialize(aState, aLexer)
    @state = aState
    @lexer = aLexer
    @links = Array.new
  end

  # ywesee: only the first assignment takes effect.
  def pointer=(value)
    @pointer ||= value
  end

  # Link back to +aParseStack+ and return the new Link.
  def add_link(aParseStack, aTree, aProduction = nil)
    aParseStack.pointer = @lexer.scanner.pointer # ywesee
    link = Link.new(aParseStack, aTree, aProduction)
    @links.push(link)
    link
  end

  def links_to_stack_in_state?(state)
    @links.detect { |link| link.stack.state == state }
  end

  def links_to_stack?(stack)
    @links.detect { |link| link.stack == stack }
  end

  # All paths of +length+ links starting here; when +aLink+ is given, only
  # paths passing through it are returned.
  def paths_of_length(length, aLink = nil)
    return [] if length == 0
    paths = Array.new
    @links.each do |link|
      sub_paths = link.stack.paths_of_length(length - 1)
      if sub_paths.empty?
        paths.push StackPath.new(self, [link])
      else
        sub_paths.each do |sub|
          paths.push(StackPath.new(self, [link]).add_path(sub))
        end
      end
    end
    delete_paths_without_link(paths, aLink)
  end

  # Like #paths_of_length, but drops links rejected by +validity_checker+
  # (pruning them from @links as a side effect).
  def valid_paths_of_length_with_pruning(length, aLink = nil,
                                         &validity_checker)
    return [] if length == 0
    paths, kept_links = Array.new, Array.new
    @links.each do |link|
      next unless validity_checker.call(link, length - 1)
      if length == 1
        kept_links.push(link)
        paths.push(StackPath.new(self, [link]))
      else
        sub_paths =
          link.stack.valid_paths_of_length_with_pruning(length - 1,
                                                        &validity_checker)
        unless sub_paths.empty?
          kept_links.push(link)
          sub_paths.each do |sub|
            paths.push(StackPath.new(self, [link]).add_path(sub))
          end
        end
      end
    end
    @links = kept_links
    delete_paths_without_link(paths, aLink)
  end

  def inspect
    "PSt(#{@state}, #{@links.inspect}, #{@lexer.inspect})"
  end

  private

  # When +aLink+ is given, keep only the paths that include it.
  def delete_paths_without_link(paths, aLink)
    return paths unless aLink
    paths.find_all { |path| path.includes_link?(aLink) }
  end
end
|
|
363
|
+
|
|
364
|
+
# A concrete path through the parse-stack graph: the sequence of links
# walked from +from+ down to the current end node +to+.
class StackPath
  attr_reader :to, :from, :links

  def initialize(from, links = [])
    @from = from
    @links = Array.new
    links.each { |link| add_link(link) }
  end

  # Append +link+; the path now ends at that link's stack.
  def add_link(link)
    @links.push link
    @to = link.stack
  end

  # Append all of +aStackPath+'s links and return self.
  def add_path(aStackPath)
    @links.concat aStackPath.links
    @to = @links.last.stack
    self
  end

  # The matching link (truthy) when the path contains +link+, else nil.
  def includes_link?(link)
    @links.detect { |l| l == link }
  end

  def inspect
    "#{from.state}(#{from.lexer.position.char_position}) " +
      @links.map { |l| "-#{l.tree.inspect}-> #{l.stack.state} (#{l.stack.lexer.position.char_position}) " }.join
  end
end
|
|
393
|
+
|