rpdf2txt 0.8.2
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +5 -0
- data/LICENCE +515 -0
- data/Manifest.txt +126 -0
- data/README.txt +30 -0
- data/Rakefile +24 -0
- data/bin/rpdf2txt +58 -0
- data/config.save +12 -0
- data/install.rb +1098 -0
- data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
- data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
- data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
- data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
- data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
- data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
- data/lib/rpdf2txt-rockit/grammar.rb +644 -0
- data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
- data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
- data/lib/rpdf2txt-rockit/indexable.rb +53 -0
- data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
- data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
- data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
- data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
- data/lib/rpdf2txt-rockit/profiler.rb +168 -0
- data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
- data/lib/rpdf2txt-rockit/rockit.rb +76 -0
- data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
- data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
- data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
- data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
- data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
- data/lib/rpdf2txt-rockit/token.rb +364 -0
- data/lib/rpdf2txt-rockit/version.rb +3 -0
- data/lib/rpdf2txt/attributesparser.rb +42 -0
- data/lib/rpdf2txt/cmapparser.rb +65 -0
- data/lib/rpdf2txt/data/_cmap.grammar +11 -0
- data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/cmap.grammar +11 -0
- data/lib/rpdf2txt/data/cmap.rb +37 -0
- data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/cmap_range.rb +43 -0
- data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
- data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
- data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
- data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
- data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
- data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
- data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
- data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
- data/lib/rpdf2txt/data/pdftext.grammar +102 -0
- data/lib/rpdf2txt/data/pdftext.rb +146 -0
- data/lib/rpdf2txt/default_handler.rb +352 -0
- data/lib/rpdf2txt/lzw.rb +69 -0
- data/lib/rpdf2txt/object.rb +1114 -0
- data/lib/rpdf2txt/parser.rb +169 -0
- data/lib/rpdf2txt/symbol.rb +408 -0
- data/lib/rpdf2txt/text.rb +182 -0
- data/lib/rpdf2txt/text_state.rb +434 -0
- data/lib/rpdf2txt/textparser.rb +42 -0
- data/test/data/3392_obj +0 -0
- data/test/data/397_decrypted +15 -0
- data/test/data/450_decrypted +153 -0
- data/test/data/450_obj +0 -0
- data/test/data/452_decrypted +125 -0
- data/test/data/454_decrypted +108 -0
- data/test/data/456_decrypted +106 -0
- data/test/data/458_decrypted +111 -0
- data/test/data/458_obj +0 -0
- data/test/data/460_decrypted +118 -0
- data/test/data/460_obj +0 -0
- data/test/data/463_decrypted +117 -0
- data/test/data/465_decrypted +107 -0
- data/test/data/465_obj +0 -0
- data/test/data/90_obj +0 -0
- data/test/data/90_obj_comp +1 -0
- data/test/data/decrypted +0 -0
- data/test/data/encrypt_obj +0 -0
- data/test/data/encrypt_string +0 -0
- data/test/data/encrypt_string_128bit +0 -0
- data/test/data/encrypted_object_stream.pdf +0 -0
- data/test/data/firststream +1 -0
- data/test/data/index.pdfobj +0 -0
- data/test/data/index_2bit.pdfobj +0 -0
- data/test/data/index_masked.pdfobj +0 -0
- data/test/data/indexed.pdfobj +0 -0
- data/test/data/indexed_2bit.pdfobj +0 -0
- data/test/data/indexed_masked.pdfobj +0 -0
- data/test/data/inline.png +0 -0
- data/test/data/logo.png +0 -0
- data/test/data/lzw.pdfobj +0 -0
- data/test/data/lzw_index.pdfobj +0 -0
- data/test/data/page_tree.pdf +148 -0
- data/test/data/pdf_20.png +0 -0
- data/test/data/pdf_21.png +0 -0
- data/test/data/pdf_22.png +0 -0
- data/test/data/pdf_50.png +0 -0
- data/test/data/png.pdfobj +0 -0
- data/test/data/space_bug_stream.txt +119 -0
- data/test/data/stream.txt +292 -0
- data/test/data/stream_kerning_bug.txt +13 -0
- data/test/data/stream_kerning_bug2.txt +6 -0
- data/test/data/test.pdf +0 -0
- data/test/data/test.txt +8 -0
- data/test/data/test_text.txt +42 -0
- data/test/data/working_obj +0 -0
- data/test/data/working_obj2 +0 -0
- data/test/mock.rb +149 -0
- data/test/suite.rb +30 -0
- data/test/test_pdf_object.rb +1802 -0
- data/test/test_pdf_parser.rb +1340 -0
- data/test/test_pdf_text.rb +789 -0
- data/test/test_space_bug_05_2004.rb +87 -0
- data/test/test_stream.rb +194 -0
- data/test/test_text_state.rb +315 -0
- data/usage-en.txt +112 -0
- data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
- data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
- metadata +220 -0
@@ -0,0 +1,364 @@
|
|
1
|
+
require 'rpdf2txt-rockit/syntax_tree'
|
2
|
+
require 'rpdf2txt-rockit/sourcecode_dumpable'
|
3
|
+
require 'rpdf2txt-rockit/bounded_lru_cache'
|
4
|
+
|
5
|
+
# A Regexp that is anchored at the start of its input. Unless the given
# pattern already begins with "^", it is wrapped as "^(pattern)".
class TokenRegexp < Regexp
  # aStringOrRegexp:: a pattern source (String) or a Regexp whose source and
  #                   options are reused.
  # Raises ArgumentError for any other kind of argument.
  def initialize(aStringOrRegexp)
    case aStringOrRegexp
    when String
      @string = anchor(aStringOrRegexp)
      super(@string)
    when Regexp
      @string = anchor(aStringOrRegexp.source)
      super(@string, aStringOrRegexp.options)
    else
      raise ArgumentError,
            "expected a String or Regexp, got #{aStringOrRegexp.class}"
    end
  end

  # Returns the pattern without the "^( ... )" wrapper, or the stored pattern
  # unchanged when it is not of that shape.
  #
  # The wrapper is only recognised when it spans the WHOLE pattern (and the
  # pattern may contain newlines). The previous unanchored check stripped
  # characters from patterns such as "^(a)b" and never unwrapped patterns
  # containing a newline.
  # NOTE(review): a user-supplied pattern like "^(a)|(b)" still looks like a
  # wrapper; telling the two apart would need real parenthesis matching.
  def string
    wrapped = /\A\^\((.*)\)\z/m.match(@string)
    wrapped ? wrapped[1] : @string
  end

  private

  # Prefixes the start anchor and a group unless the source is anchored already.
  def anchor(source)
    source[0, 1] == "^" ? source : "^(" + source + ")"
  end
end
|
28
|
+
|
29
|
+
# Shorthands for composing token regexps
|
30
|
+
# Shorthand for building a TokenRegexp.
#
# An object whose class is exactly Regexp is reduced to its source first (so
# its options are not carried over); anything else, including a TokenRegexp,
# is handed to TokenRegexp.new as it is.
def tr(aStringOrRegexp)
  pattern = if aStringOrRegexp.instance_of?(Regexp)
              aStringOrRegexp.source
            else
              aStringOrRegexp
            end
  TokenRegexp.new(pattern)
end
|
34
|
+
|
35
|
+
# Builds one TokenRegexp out of several. Each token's pattern string is
# formatted through +map+ and the pieces are joined with +separator+.
#
# tokens::    non-empty Array of objects responding to #string
# map::       format string with a single %s placeholder, e.g. "(%s)"
# separator:: String placed between the formatted pieces
#
# Raises ArgumentError when +tokens+ is empty (this used to surface as a
# NoMethodError on nil).
def tre_compose(tokens, map, separator)
  raise ArgumentError, "tre_compose needs at least one token" if tokens.empty?
  tr(tokens.map { |token| map % token.string }.join(separator))
end
|
40
|
+
|
41
|
+
# Alternation: a TokenRegexp matching any one of the given token regexps.
# Each pattern is wrapped in its own group and the groups are joined with "|".
def ror(*tokens)
  group_format = "(%s)"
  tre_compose(tokens, group_format, "|")
end
|
44
|
+
|
45
|
+
# Sequence: a TokenRegexp matching the given token regexps one after another.
# Each pattern is wrapped in its own group and the groups are concatenated.
def rseq(*tokens)
  group_format = "(%s)"
  tre_compose(tokens, group_format, "")
end
|
48
|
+
|
49
|
+
# Optional: a TokenRegexp matching zero or one occurrence of +tokenregexp+.
def r?(tokenregexp)
  tr("(#{tokenregexp.string})?")
end
|
52
|
+
|
53
|
+
# Repetition: a TokenRegexp matching zero or more occurrences of +tokenregexp+.
def rm(tokenregexp)
  tr("(#{tokenregexp.string})*")
end
|
56
|
+
|
57
|
+
# Repetition: a TokenRegexp matching one or more occurrences of +tokenregexp+.
def rp(tokenregexp)
  tr("(#{tokenregexp.string})+")
end
|
60
|
+
|
61
|
+
# A named terminal symbol: a name, the TokenRegexp that recognises it and an
# optional "skip" flag.
class Token
  include SourceCodeDumpable

  attr_accessor :name
  attr_reader :regexp, :skip

  # aString::                       the token's name
  # aStringOrRegexpOrTokenRegexp::  pattern, wrapped in a TokenRegexp
  # options::                       Symbols or Strings; only "skip" (any case)
  #                                 is recognised
  def initialize(aString, aStringOrRegexpOrTokenRegexp = "", *options)
    @name = aString
    @regexp = TokenRegexp.new(aStringOrRegexpOrTokenRegexp)
    parse_options(options)
  end

  # Hash over class, name, regexp and skip flag; computed once and cached.
  def hash
    @hashvalue ||= [self.class, @name, @regexp, @skip].hash
  end

  # Sets @skip to true when one of the options spells "skip"; otherwise @skip
  # is left untouched.
  def parse_options(options)
    lowered = options.map do |option|
      option.kind_of?(Symbol) ? option.id2name.downcase : option.downcase
    end
    @skip = true if lowered.include?("skip")
  end

  # Tokens are equal when class, name, regexp (compared via #inspect) and
  # skip flag all agree.
  def ==(other)
    (other.class == self.class) &&
      (other.name == name) &&
      (other.regexp.inspect == regexp.inspect) &&
      (other.skip == skip)
  end

  # Matches +aString+ against this token's regexp.
  def match(aString)
    @regexp.match(aString)
  end

  # Maps a lexeme to its value; currently the identity.
  # TODO: Add blocks that map lexemes to values.
  def value(lexeme)
    lexeme
  end

  # Builds the SyntaxTree leaf for +lexeme+ and records +position+ in its
  # attributes.
  # NOTE(review): the child names are ["lexeme", "value"] while the children
  # are [value(lexeme), lexeme]; harmless while #value is the identity, but
  # the order looks swapped - confirm before adding value mappings.
  def create_tree(lexeme, position)
    tree = SyntaxTree.new(@name, ["lexeme", "value"], [value(lexeme), lexeme])
    tree.attributes[:position] = position
    tree
  end

  # Source code that recreates this token, optionally assigned to
  # +assignToName+. The :Skip option is emitted only for skipped tokens.
  def to_src(assignToName = nil, nameHash = {})
    constructor_args = [name, as_code(regexp.to_src)]
    constructor_args << :Skip if skip
    assign_to(assignToName, new_of_my_type(*constructor_args))
  end

  # The token's name, or the inspected regexp when there is no name.
  def inspect
    name || regexp.inspect
  end

  protected

  # Source text for the option list: ":Skip" for skipped tokens, else "".
  def options_to_src
    skip ? ":Skip" : ""
  end
end
|
132
|
+
|
133
|
+
# Marks the end of the input. It must never match any text itself.
class EofToken < Token
  # Any arguments are accepted and ignored.
  #
  # The pattern is an empty negative lookahead, which can never match. It
  # replaces the former "highly unlikely" string with a random suffix, which
  # gave every instance a different regexp (and therefore a different #hash)
  # even though all EofTokens compare equal.
  def initialize(*args)
    super("EOF", "(?!)")
  end

  # All EofTokens are interchangeable.
  def ==(other)
    other.class == self.class
  end
end
|
145
|
+
|
146
|
+
# Stands for the empty production. It must never match any text itself.
class EpsilonToken < Token
  # Any arguments are accepted and ignored, exactly like EofToken does.
  # NOTE(review): Token#to_src passes a name and a regexp to
  # new_of_my_type; presumably that ends up calling this constructor with
  # arguments, which the former zero-argument signature rejected - confirm.
  #
  # The pattern is an empty negative lookahead, which can never match. It
  # replaces the former "highly unlikely" string with a random suffix, which
  # gave every instance a different regexp (and therefore a different #hash)
  # even though all EpsilonTokens compare equal.
  def initialize(*args)
    super("epsilon", "(?!)")
  end

  # All EpsilonTokens are interchangeable.
  def ==(other)
    other.class == self.class
  end
end
|
158
|
+
|
159
|
+
# A token that matches one literal string.
class StringToken < Token
  # name::   the token's name
  # string:: the literal text to match (defaults to the name); it is escaped
  #          before being turned into the token's regexp
  def initialize(name, string = name)
    @string = string
    super(name, Regexp.escape(string))
  end

  # Source code that recreates this token from its name and literal string.
  def to_src(assignToName = nil, nameHash = {})
    assign_to(assignToName, new_of_my_type(name, @string))
  end

  # Hash over class, name and literal string; computed once and cached.
  def hash
    @hashvalue ||= [self.class, @name, @string].hash
  end

  # Debug representation: object id, literal string, inspected name, hash.
  # Uses #object_id because Object#id no longer exists in Ruby 1.9 and later.
  def to_s
    "#{object_id} #{@string} #{name.inspect} #{hash}"
  end

  # The inspected literal string.
  def inspect
    @string.inspect
  end
end
|
181
|
+
|
182
|
+
# Shorthand for an anonymous StringToken. The name is "StrToken" followed by
# the literal string's hash value.
def string_token(string)
  generated_name = "StrToken#{string.hash.inspect}"
  StringToken.new(generated_name, string)
end
|
185
|
+
|
186
|
+
# A token whose regexp is stored exactly as given. Token#initialize is not
# called, so the regexp is not wrapped in a TokenRegexp.
class RegexpToken < Token
  # aString:: the token's name
  # regexp::  the regexp to use, kept unchanged
  # options:: see Token#parse_options
  def initialize(aString, regexp, *options)
    @name = aString
    @regexp = regexp
    parse_options(options)
  end
end
|
192
|
+
|
193
|
+
# Shorthand for an anonymous RegexpToken. The name is "RegexpToken" followed
# by the regexp's hash value.
def regexp_token(regexp, *options)
  generated_name = "RegexpToken#{regexp.hash.inspect}"
  RegexpToken.new(generated_name, regexp, *options)
end
|
196
|
+
|
197
|
+
# Shorthand token constructor.
#
# When +re+ is a plain String, a StringToken is built; in that case +name+
# and +options+ are not used and the name is derived from the string's hash.
# Anything else yields a regular Token with the given name and options.
def t(name, re, *options)
  return Token.new(name, re, *options) unless re.instance_of?(String)

  StringToken.new("StrToken#{re.hash.inspect}", re)
end
|
204
|
+
|
205
|
+
require 'rpdf2txt-rockit/stringscanner' # DO *NOT* alter since install.rb exploits formatting
|
206
|
+
|
207
|
+
# What a forking lexer hands out for each match: the matched text, the token
# type that matched, the lexer to ask for the following tokens, and the
# position of the match.
class LexerToken
  attr_reader :lexeme, :token_type, :lexer, :position

  # position is optional and defaults to nil.
  def initialize(lexeme, tokenType, lexer, position = nil)
    @lexeme = lexeme
    @token_type = tokenType
    @lexer = lexer
    @position = position
  end

  # Delegates to the token type to build the tree for this lexeme.
  def create_tree
    token_type.create_tree(lexeme, position)
  end

  # Short debug form: LT(<inspected lexeme>, <token type name>).
  def inspect
    "LT(" + @lexeme.inspect + ", #{@token_type.name})"
  end
end
|
225
|
+
|
226
|
+
# An immutable position in the lexed text: row, column and absolute
# character offset, all starting at 0.
class LexerPosition
  attr_reader :row, :column, :char_position

  # One line break: CRLF counts once, a lone CR or LF counts once.
  LINE_BREAK = /\r\n|\r|\n/

  def initialize(row = 0, column = 0, char_position = 0)
    @row, @column, @char_position = row, column, char_position
  end

  # Returns the new LexerPosition reached after consuming +aString+ from
  # this position.
  #
  # Fixes over the previous version:
  # * a CRLF pair advances the row by one, not two (CR and LF used to be
  #   counted as separate newlines);
  # * the column is the length of the text after the LAST line break, so a
  #   string ending in a line break yields column 0 (it used to yield the
  #   length of the preceding line);
  # * no exception is used to detect the "nothing after the break" case.
  def +(aString)
    char_position = @char_position + aString.length
    line_breaks = aString.scan(LINE_BREAK).length
    if line_breaks.zero?
      LexerPosition.new(@row, @column + aString.length, char_position)
    else
      # This pattern always matches, possibly with an empty string.
      tail = aString[/[^\r\n]*\z/]
      LexerPosition.new(@row + line_breaks, tail.length, char_position)
    end
  end

  def inspect
    "(row=#{row},column=#{@column})"
  end
end
|
254
|
+
|
255
|
+
# Lexer that returns ALL tokens matching at its position. Every returned
# LexerToken carries the lexer positioned after its match, so a parser can
# follow several tokenisations in parallel.
#
# NOTE: If more performance is needed it might be good to use one character
# of lookahead to group tokens and reduce the number of tokens that need to
# be tested.
class ForkingRegexpLexer
  attr_accessor :position
  attr_reader :scanner, :tokens, :lexer_cache, :eof_token
  protected :lexer_cache

  # tokens::   Array of tokens to try. Any EofToken in it is remembered as
  #            the EOF token and REMOVED from the array in place (the
  #            caller's array is modified, as before).
  # eofToken:: used as the EOF token when +tokens+ contains no EofToken.
  #            This parameter used to be ignored.
  def initialize(tokens, eofToken = nil)
    @tokens = tokens
    @eof_token = tokens.detect { |t| t.kind_of?(EofToken) } || eofToken
    @tokens.delete_if { |t| t.kind_of?(EofToken) }
  end

  # Shared fallback used by #peek when no EOF token was supplied.
  @@eof_token = EofToken.new

  # Prepares the lexer for lexing +aString+ from the beginning.
  def init(aString)
    @position, @current_tokens = LexerPosition.new, nil
    # The cached length belongs to the previous input. Without this reset a
    # reused lexer would detect EOF against the old string's length.
    @string_length = nil
    @scanner = StringScanner.new(aString)

    # We speed things up by only having one lexer at each position. Since
    # there are typically only a small number of positions we keep them in a
    # BoundedLruCache of size 20. The cache throws out the oldest (least
    # recently used - NOTE: accessed in the cache, not used in the parser)
    # lexer when a new one is inserted. This keeps memory consumption down.
    @lexer_cache = BoundedLruCache.new(20)
  end

  # Returns the Array of LexerTokens matching at the current position; the
  # result is computed once and cached.
  #
  # A matching token flagged as skip contributes the tokens found AFTER it
  # instead of itself. When nothing matches and the position is at or past
  # the end of the input, the result is a single EOF LexerToken; when nothing
  # matches elsewhere, the result is empty.
  #
  # Refactor! Complex interactions when tokens are skipped, since the next
  # lexer updates "our" scanner. Find a cleaner way of expressing this!
  def peek
    return @current_tokens if @current_tokens
    scanner.pointer = @position.char_position
    @current_tokens = Array.new
    tokens.each do |token|
      if (match = scanner.check(token.regexp))
        if token.skip
          # Token to be skipped => return tokens matching after the skipped one
          @current_tokens.concat next_lexer(match).peek
          # The nested peek moved the shared scanner; put it back.
          scanner.pointer = @position.char_position
        else
          @current_tokens.push LexerToken.new(match, token,
                                              next_lexer(match), @position)
        end
      end
    end
    if @current_tokens.length == 0
      @string_length ||= scanner.string.length
      if @position.char_position >= @string_length
        @current_tokens.push LexerToken.new(nil, eof_token || @@eof_token,
                                            nil, @position)
      end
    end
    return @current_tokens
  end

  def inspect
    "Lexer(#{@position.inspect})"
  end

  protected

  # Returns the lexer positioned right after +matchingString+, reusing the
  # cached lexer for that character position when there is one.
  def next_lexer(matchingString)
    pos = @position + matchingString
    char_pos = pos.char_position
    lexer = self.lexer_cache[char_pos]
    self.lexer_cache[char_pos] = lexer = create_next_lexer(pos) unless lexer
    lexer
  end

  # Creates a lexer for +pos+ that refers back to this one for shared state.
  def create_next_lexer(pos)
    ReferencingRegexpLexer.new(self, pos)
  end
end
|
332
|
+
|
333
|
+
# A lexer at a later position that keeps no scanner, tokens, cache or EOF
# token of its own; all of them are read from the parent lexer.
class ReferencingRegexpLexer < ForkingRegexpLexer
  # aForkingRegexpLexer:: the lexer owning the shared state
  # position::            the LexerPosition this lexer stands at
  def initialize(aForkingRegexpLexer, position)
    @parent_lexer = aForkingRegexpLexer
    @position = position
  end

  def inspect
    "RefLexer(" + @position.inspect + ")"
  end

  # The parent's scanner.
  def scanner
    @parent_lexer.scanner
  end

  # Follow-up lexers refer to the same parent, not to this lexer.
  def create_next_lexer(pos)
    ReferencingRegexpLexer.new(@parent_lexer, pos)
  end

  # The parent's lexer cache.
  def lexer_cache
    @parent_lexer.lexer_cache
  end

  # The parent's EOF token.
  def eof_token
    @parent_lexer.eof_token
  end

  # The parent's token list.
  def tokens
    @parent_lexer.tokens
  end

  protected :create_next_lexer, :lexer_cache, :eof_token, :tokens
end
|
364
|
+
|
@@ -0,0 +1,42 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Rpdf2txt -- PDF to Text Parser
|
4
|
+
# Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
|
5
|
+
#
|
6
|
+
# This library is free software; you can redistribute it and/or
|
7
|
+
# modify it under the terms of the GNU Lesser General Public
|
8
|
+
# License as published by the Free Software Foundation; either
|
9
|
+
# version 2.1 of the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This library is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14
|
+
# Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with this library; if not, write to the Free Software
|
18
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
19
|
+
#
|
20
|
+
# ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
21
|
+
# hwyss@ywesee.com, aschrafl@ywesee.com
|
22
|
+
#
|
23
|
+
# AttributesParser -- Rpdf2txt -- 19.12.2002 -- hwyss@ywesee.com
|
24
|
+
|
25
|
+
require 'rpdf2txt-rockit/rockit'
|
26
|
+
|
27
|
+
module Rpdf2txt
  GRAMMAR_PATH = File.expand_path('data/pdfattributes.grammar', File.dirname(__FILE__))
  PARSER_PATH = File.expand_path('data/pdfattributes.rb', File.dirname(__FILE__))

  # Returns the PDF attributes parser, regenerating its source file first
  # when the grammar has changed.
  #
  # A copy of the grammar used for the last generation is kept next to the
  # grammar file under the same name prefixed with "_". The parser is
  # regenerated when that copy is missing or differs from the current
  # grammar; the copy is rewritten only after generation has returned.
  #
  # grammar_path:: path of the grammar file
  # parser_path::  path of the generated parser source; it is required and
  #                expected to define Rpdf2txt._attr_parser
  def attributes_parser(grammar_path=GRAMMAR_PATH, parser_path=PARSER_PATH)
    oldpath = File.join(File.dirname(grammar_path),
                        "_" + File.basename(grammar_path))
    src = File.read(grammar_path)
    # File.exist? is used because File.exists? was removed in Ruby 3.2.
    up_to_date = File.exist?(oldpath) && File.read(oldpath) == src
    unless up_to_date
      File.delete(oldpath) if File.exist?(oldpath)
      Parse.generate_parser_from_file_to_file(grammar_path, parser_path, '_attr_parser', 'Rpdf2txt')
      File.open(oldpath, 'w') { |f| f << src }
    end
    require parser_path
    Rpdf2txt._attr_parser
  end
  module_function :attributes_parser
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Rpdf2txt -- PDF to Text Parser
|
4
|
+
# Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
|
5
|
+
#
|
6
|
+
# This library is free software; you can redistribute it and/or
|
7
|
+
# modify it under the terms of the GNU Lesser General Public
|
8
|
+
# License as published by the Free Software Foundation; either
|
9
|
+
# version 2.1 of the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This library is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14
|
+
# Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with this library; if not, write to the Free Software
|
18
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
19
|
+
#
|
20
|
+
# ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
21
|
+
# hwyss@ywesee.com, aschrafl@ywesee.com
|
22
|
+
#
|
23
|
+
# TextParser -- Rpdf2txt -- 04.11.2004 -- mwalder@ywesee.com
|
24
|
+
# rwaltert@ywesee.com
|
25
|
+
|
26
|
+
require 'rpdf2txt-rockit/rockit'
|
27
|
+
|
28
|
+
module Rpdf2txt
  CMAP_GRAMMAR = File.expand_path('data/cmap.grammar',
                                  File.dirname(__FILE__))
  CMAP_PARSER = File.expand_path('data/cmap.rb',
                                 File.dirname(__FILE__))
  CMAP_RANGE_GRAMMAR = File.expand_path('data/cmap_range.grammar',
                                        File.dirname(__FILE__))
  CMAP_RANGE_PARSER = File.expand_path('data/cmap_range.rb',
                                       File.dirname(__FILE__))

  # Returns the cmap parser, regenerating its source file first when the
  # grammar has changed. The required parser file is expected to define
  # Rpdf2txt._cmap_parser.
  def Rpdf2txt.cmap_parser(grammar_path=CMAP_GRAMMAR,
                           parser_path=CMAP_PARSER)
    _refresh_cmap_parser(grammar_path, parser_path, '_cmap_parser')
    Rpdf2txt._cmap_parser
  end

  # Returns the cmap range parser, regenerating its source file first when
  # the grammar has changed. The required parser file is expected to define
  # Rpdf2txt._cmap_range_parser.
  def Rpdf2txt.cmap_range_parser(grammar_path=CMAP_RANGE_GRAMMAR,
                                 parser_path=CMAP_RANGE_PARSER)
    _refresh_cmap_parser(grammar_path, parser_path, '_cmap_range_parser')
    Rpdf2txt._cmap_range_parser
  end

  # Regenerates the parser at +parser_path+ when needed, then requires it.
  #
  # A copy of the grammar used for the last generation is kept next to the
  # grammar file under the same name prefixed with "_". The parser is
  # regenerated when that copy is missing or differs from the current
  # grammar; the copy is rewritten only after generation has returned.
  # +parser_name+ is the accessor name handed to the generator.
  def Rpdf2txt._refresh_cmap_parser(grammar_path, parser_path, parser_name)
    oldpath = File.join(File.dirname(grammar_path),
                        "_" + File.basename(grammar_path))
    src = File.read(grammar_path)
    # File.exist? is used because File.exists? was removed in Ruby 3.2.
    up_to_date = File.exist?(oldpath) && File.read(oldpath) == src
    unless up_to_date
      File.delete(oldpath) if File.exist?(oldpath)
      Parse.generate_parser_from_file_to_file(grammar_path,
        parser_path, parser_name, 'Rpdf2txt')
      File.open(oldpath, 'w') { |f| f << src }
    end
    require parser_path
  end
  private_class_method :_refresh_cmap_parser
end
|