rpdf2txt 0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +5 -0
- data/LICENCE +515 -0
- data/Manifest.txt +126 -0
- data/README.txt +30 -0
- data/Rakefile +24 -0
- data/bin/rpdf2txt +58 -0
- data/config.save +12 -0
- data/install.rb +1098 -0
- data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
- data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
- data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
- data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
- data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
- data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
- data/lib/rpdf2txt-rockit/grammar.rb +644 -0
- data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
- data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
- data/lib/rpdf2txt-rockit/indexable.rb +53 -0
- data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
- data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
- data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
- data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
- data/lib/rpdf2txt-rockit/profiler.rb +168 -0
- data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
- data/lib/rpdf2txt-rockit/rockit.rb +76 -0
- data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
- data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
- data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
- data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
- data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
- data/lib/rpdf2txt-rockit/token.rb +364 -0
- data/lib/rpdf2txt-rockit/version.rb +3 -0
- data/lib/rpdf2txt/attributesparser.rb +42 -0
- data/lib/rpdf2txt/cmapparser.rb +65 -0
- data/lib/rpdf2txt/data/_cmap.grammar +11 -0
- data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/cmap.grammar +11 -0
- data/lib/rpdf2txt/data/cmap.rb +37 -0
- data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/cmap_range.rb +43 -0
- data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
- data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
- data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
- data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
- data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
- data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
- data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
- data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
- data/lib/rpdf2txt/data/pdftext.grammar +102 -0
- data/lib/rpdf2txt/data/pdftext.rb +146 -0
- data/lib/rpdf2txt/default_handler.rb +352 -0
- data/lib/rpdf2txt/lzw.rb +69 -0
- data/lib/rpdf2txt/object.rb +1114 -0
- data/lib/rpdf2txt/parser.rb +169 -0
- data/lib/rpdf2txt/symbol.rb +408 -0
- data/lib/rpdf2txt/text.rb +182 -0
- data/lib/rpdf2txt/text_state.rb +434 -0
- data/lib/rpdf2txt/textparser.rb +42 -0
- data/test/data/3392_obj +0 -0
- data/test/data/397_decrypted +15 -0
- data/test/data/450_decrypted +153 -0
- data/test/data/450_obj +0 -0
- data/test/data/452_decrypted +125 -0
- data/test/data/454_decrypted +108 -0
- data/test/data/456_decrypted +106 -0
- data/test/data/458_decrypted +111 -0
- data/test/data/458_obj +0 -0
- data/test/data/460_decrypted +118 -0
- data/test/data/460_obj +0 -0
- data/test/data/463_decrypted +117 -0
- data/test/data/465_decrypted +107 -0
- data/test/data/465_obj +0 -0
- data/test/data/90_obj +0 -0
- data/test/data/90_obj_comp +1 -0
- data/test/data/decrypted +0 -0
- data/test/data/encrypt_obj +0 -0
- data/test/data/encrypt_string +0 -0
- data/test/data/encrypt_string_128bit +0 -0
- data/test/data/encrypted_object_stream.pdf +0 -0
- data/test/data/firststream +1 -0
- data/test/data/index.pdfobj +0 -0
- data/test/data/index_2bit.pdfobj +0 -0
- data/test/data/index_masked.pdfobj +0 -0
- data/test/data/indexed.pdfobj +0 -0
- data/test/data/indexed_2bit.pdfobj +0 -0
- data/test/data/indexed_masked.pdfobj +0 -0
- data/test/data/inline.png +0 -0
- data/test/data/logo.png +0 -0
- data/test/data/lzw.pdfobj +0 -0
- data/test/data/lzw_index.pdfobj +0 -0
- data/test/data/page_tree.pdf +148 -0
- data/test/data/pdf_20.png +0 -0
- data/test/data/pdf_21.png +0 -0
- data/test/data/pdf_22.png +0 -0
- data/test/data/pdf_50.png +0 -0
- data/test/data/png.pdfobj +0 -0
- data/test/data/space_bug_stream.txt +119 -0
- data/test/data/stream.txt +292 -0
- data/test/data/stream_kerning_bug.txt +13 -0
- data/test/data/stream_kerning_bug2.txt +6 -0
- data/test/data/test.pdf +0 -0
- data/test/data/test.txt +8 -0
- data/test/data/test_text.txt +42 -0
- data/test/data/working_obj +0 -0
- data/test/data/working_obj2 +0 -0
- data/test/mock.rb +149 -0
- data/test/suite.rb +30 -0
- data/test/test_pdf_object.rb +1802 -0
- data/test/test_pdf_parser.rb +1340 -0
- data/test/test_pdf_text.rb +789 -0
- data/test/test_space_bug_05_2004.rb +87 -0
- data/test/test_stream.rb +194 -0
- data/test/test_text_state.rb +315 -0
- data/usage-en.txt +112 -0
- data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
- data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
- metadata +220 -0
|
@@ -0,0 +1,364 @@
|
|
|
1
|
+
require 'rpdf2txt-rockit/syntax_tree'
|
|
2
|
+
require 'rpdf2txt-rockit/sourcecode_dumpable'
|
|
3
|
+
require 'rpdf2txt-rockit/bounded_lru_cache'
|
|
4
|
+
|
|
5
|
+
# A Regexp whose pattern is anchored with "^", so that it is tried at the
# start of the text handed to it rather than somewhere in the middle.
#
# A source that does not already begin with "^" is wrapped as "^(source)".
# #string removes that wrapper again so the bare pattern can be embedded in
# a larger expression.
class TokenRegexp < Regexp
  # aStringOrRegexp:: a pattern String, or a Regexp (including another
  #                   TokenRegexp) whose source and options are reused.
  # Raises ArgumentError for any other kind of argument.
  def initialize(aStringOrRegexp)
    case aStringOrRegexp
    when String
      @string = anchored_source(aStringOrRegexp)
      super(@string)
    when Regexp
      @string = anchored_source(aStringOrRegexp.source)
      super(@string, aStringOrRegexp.options)
    else
      raise ArgumentError,
            "expected a String or Regexp, got #{aStringOrRegexp.class}"
    end
  end

  # Returns the pattern without the "^( ... )" wrapper.
  #
  # The wrapper is only stripped when the stored source starts with "^(" and
  # ends with ")"; any other source (for example "^(a)|b", or a source the
  # caller anchored with a bare "^") is returned unchanged instead of being
  # cut in the wrong place. Plain string tests are used so that sources
  # containing literal newlines are handled as well.
  def string
    if @string.start_with?("^(") && @string.end_with?(")")
      @string[2...-1]
    else
      @string
    end
  end

  private

  # Returns +source+ itself when it already begins with "^", otherwise a new
  # string "^(source)". The argument is never modified.
  def anchored_source(source)
    source[0, 1] == "^" ? source : "^(" + source + ")"
  end
end
|
|
28
|
+
|
|
29
|
+
# Shorthands for composing token regexps
|
|
30
|
+
# Builds a TokenRegexp from a pattern string or a Regexp.
#
# An object whose class is exactly Regexp is reduced to its source text
# first, so it takes the String path of TokenRegexp.new (no options are
# passed on). Anything else is handed to TokenRegexp.new unchanged.
def tr(aStringOrRegexp)
  pattern =
    if aStringOrRegexp.instance_of?(Regexp)
      aStringOrRegexp.source
    else
      aStringOrRegexp
    end
  TokenRegexp.new(pattern)
end
|
|
34
|
+
|
|
35
|
+
# Combines several token regexps into one.
#
# tokens::    objects responding to #string (at least one is required)
# map::       format string applied to each token's pattern, e.g. "(%s)"
# separator:: text placed between the formatted patterns
#
# Returns the result of passing the combined pattern to tr.
def tre_compose(tokens, map, separator)
  head, *tail = tokens
  combined = tail.inject(map % head.string) do |pattern, token|
    pattern + separator + (map % token.string)
  end
  tr(combined)
end
|
|
40
|
+
|
|
41
|
+
# Alternation: each token's pattern is parenthesised and the groups are
# joined with "|".
def ror(*tokens)
  group, alternation = "(%s)", "|"
  tre_compose(tokens, group, alternation)
end
|
|
44
|
+
|
|
45
|
+
# Sequence: each token's pattern is parenthesised and the groups are placed
# directly after one another.
def rseq(*tokens)
  group, no_separator = "(%s)", ""
  tre_compose(tokens, group, no_separator)
end
|
|
48
|
+
|
|
49
|
+
# Optional: wraps the token's pattern in a group followed by "?".
def r?(tokenregexp)
  optional = "(#{tokenregexp.string})?"
  tr(optional)
end
|
|
52
|
+
|
|
53
|
+
# Zero or more: wraps the token's pattern in a group followed by "*".
def rm(tokenregexp)
  repeated = "(#{tokenregexp.string})*"
  tr(repeated)
end
|
|
56
|
+
|
|
57
|
+
# One or more: wraps the token's pattern in a group followed by "+".
def rp(tokenregexp)
  repeated = "(#{tokenregexp.string})+"
  tr(repeated)
end
|
|
60
|
+
|
|
61
|
+
# A named token type: a name, the TokenRegexp recognising its lexemes and an
# optional "skip" flag.
class Token
  include SourceCodeDumpable

  attr_reader :skip, :regexp
  attr_accessor :name

  # aString::                       the token's name
  # aStringOrRegexpOrTokenRegexp::  pattern, converted with TokenRegexp.new
  # options::                       Symbols or Strings; "skip" (any case)
  #                                 sets the skip flag
  def initialize(aString, aStringOrRegexpOrTokenRegexp = "", *options)
    token_regexp = TokenRegexp.new(aStringOrRegexpOrTokenRegexp)
    @name = aString
    @regexp = token_regexp
    parse_options(options)
  end

  # Hash over class, name, regexp and skip flag; computed once and cached.
  def hash
    @hashvalue ||= [self.class, @name, @regexp, @skip].hash
  end

  # Lower-cases every option name and turns on @skip when "skip" is among
  # them. @skip is left untouched (nil on a fresh token) otherwise.
  def parse_options(options)
    option_names = options.map do |option|
      text = option.kind_of?(Symbol) ? option.id2name : option
      text.downcase
    end
    @skip = true if option_names.include?("skip")
  end

  # Tokens are equal when class, name, skip flag and the inspect form of
  # their regexps all agree.
  def ==(other)
    return false unless other.class == self.class
    return false unless other.name == name
    return false unless other.regexp.inspect == regexp.inspect
    other.skip == skip
  end

  # Applies the token's regexp to aString.
  def match(aString)
    @regexp.match(aString)
  end

  # Value associated with a lexeme; currently the lexeme itself.
  def value(lexeme)
    # TODO: Add blocks that map lexeme's to values.
    lexeme
  end

  # Builds a SyntaxTree node named after this token and records +position+
  # in the node's attributes under :position.
  def create_tree(lexeme, position)
    tree = SyntaxTree.new(@name, ["lexeme", "value"], [value(lexeme), lexeme])
    tree.attributes[:position] = position
    tree
  end

  # Source code that recreates this token; :Skip is appended to the
  # constructor arguments for skip tokens.
  def to_src(assignToName = nil, nameHash = {})
    constructor_args = [name, as_code(regexp.to_src)]
    constructor_args << :Skip if skip
    assign_to(assignToName, new_of_my_type(*constructor_args))
  end

  # The token's name, or the regexp's inspect form when there is no name.
  def inspect
    name || regexp.inspect
  end

  protected

  # The option list in source form: ":Skip" for skip tokens, "" otherwise.
  def options_to_src
    skip ? ":Skip" : ""
  end
end
|
|
132
|
+
|
|
133
|
+
# Token type standing for the end of the input.
class EofToken < Token
  # Any arguments are accepted and ignored.
  #
  # The pattern must never match real input. "(?!)" is an empty negative
  # lookahead, which fails at every position, so this is guaranteed rather
  # than merely unlikely. It is also plain ASCII and identical for every
  # instance, so equal EofTokens produce equal hashes.
  def initialize(*args)
    super("EOF", "(?!)")
  end

  # All EofTokens are considered equal to one another.
  def ==(other)
    other.class == self.class
  end
end
|
|
145
|
+
|
|
146
|
+
# Token type standing for the empty string (epsilon) in a grammar.
class EpsilonToken < Token
  # The pattern must never match real input. "(?!)" is an empty negative
  # lookahead, which fails at every position, so this is guaranteed rather
  # than merely unlikely. It is also plain ASCII and identical for every
  # instance, so equal EpsilonTokens produce equal hashes.
  def initialize
    super("epsilon", "(?!)")
  end

  # All EpsilonTokens are considered equal to one another.
  def ==(other)
    other.class == self.class
  end
end
|
|
158
|
+
|
|
159
|
+
# A token that matches one literal string.
class StringToken < Token
  # name::   the token's name
  # string:: the literal text to match (defaults to the name); it is
  #          regexp-escaped before being handed to Token
  def initialize(name, string = name)
    @string = string
    super(name, Regexp.escape(string))
  end

  # Source code that recreates this token from its name and literal string.
  def to_src(assignToName = nil, nameHash = {})
    assign_to(assignToName, new_of_my_type(name, @string))
  end

  # Hash over class, name and literal string; computed once and cached.
  def hash
    @hashvalue ||= [self.class, @name, @string].hash
  end

  # Debug representation: object id, literal string, name and hash.
  # Uses object_id; the older Object#id it replaces no longer exists in
  # current Ruby versions.
  def to_s
    "#{object_id} #{@string} #{name.inspect} #{hash}"
  end

  # The literal string in inspect form.
  def inspect
    @string.inspect
  end
end
|
|
181
|
+
|
|
182
|
+
# Creates a StringToken for +string+ whose name is "StrToken" followed by
# the string's hash value.
def string_token(string)
  generated_name = "StrToken" + string.hash.inspect
  StringToken.new(generated_name, string)
end
|
|
185
|
+
|
|
186
|
+
# A token that stores the given regexp object as-is; unlike Token, the
# constructor does not convert it with TokenRegexp.new.
class RegexpToken < Token
  # aString:: the token's name
  # regexp::  the regexp object to store
  # options:: passed to parse_options (see Token)
  def initialize(aString, regexp, *options)
    @name = aString
    @regexp = regexp
    parse_options(options)
  end
end
|
|
192
|
+
|
|
193
|
+
# Creates a RegexpToken for +regexp+ whose name is "RegexpToken" followed by
# the regexp's hash value; +options+ are forwarded unchanged.
def regexp_token(regexp, *options)
  generated_name = "RegexpToken" + regexp.hash.inspect
  RegexpToken.new(generated_name, regexp, *options)
end
|
|
196
|
+
|
|
197
|
+
# Shorthand token constructor.
#
# When +re+ is exactly a String, a StringToken named "StrToken<hash>" is
# returned; +name+ and +options+ are not used in that case. For any other
# +re+ a Token is built from name, re and options.
def t(name, re, *options)
  return StringToken.new("StrToken" + re.hash.inspect, re) if re.instance_of?(String)

  Token.new(name, re, *options)
end
|
|
204
|
+
|
|
205
|
+
require 'rpdf2txt-rockit/stringscanner' # DO *NOT* alter since install.rb exploits formatting
|
|
206
|
+
|
|
207
|
+
# Forking lexers return LexerToken's with the info about a matching token
|
|
208
|
+
# and the lexer to access for next tokens.
|
|
209
|
+
# One matched token as reported by a lexer: the matched text, the token
# type, the lexer to ask for the following tokens, and the position.
class LexerToken
  attr_reader :lexeme, :token_type, :lexer, :position

  # lexeme::    the matched text
  # tokenType:: the token type that matched
  # lexer::     lexer positioned after this token
  # position::  where the lexeme was found (optional)
  def initialize(lexeme, tokenType, lexer, position = nil)
    @lexeme = lexeme
    @token_type = tokenType
    @lexer = lexer
    @position = position
  end

  # Asks the token type to build the tree node for this lexeme and position.
  def create_tree
    token_type.create_tree(lexeme, position)
  end

  # Short form: LT(<lexeme.inspect>, <token type name>).
  def inspect
    format("LT(%s, %s)", lexeme.inspect, token_type.name)
  end
end
|
|
225
|
+
|
|
226
|
+
# Immutable position in the input: row, column and absolute character
# offset, all counted from zero.
class LexerPosition
  attr_reader :row, :column, :char_position

  # A line break is "\r\n", a lone "\r" or a lone "\n"; "\r\n" counts once.
  LINE_BREAK = /\r\n|\r|\n/

  def initialize(row = 0, column = 0, char_position = 0)
    @row, @column, @char_position = row, column, char_position
  end

  # Returns a new LexerPosition advanced past +aString+; the receiver is not
  # changed.
  #
  # The character offset grows by the string's length. Without a line break
  # the column grows by the same amount. With line breaks the row grows by
  # their number and the column becomes the number of characters following
  # the last one (0 when the string ends in a line break).
  def +(aString)
    char_position = @char_position + aString.length
    num_newlines = aString.scan(LINE_BREAK).length
    if num_newlines == 0
      row = @row
      column = @column + aString.length
    else
      row = @row + num_newlines
      # rindex cannot return nil here: at least one line break was found.
      last_break = aString.rindex(/[\r\n]/)
      column = aString.length - last_break - 1
    end
    LexerPosition.new(row, column, char_position)
  end

  def inspect
    "(row=#{row},column=#{@column})"
  end
end
|
|
254
|
+
|
|
255
|
+
# NOTE: If more performance is needed it might be good to use one char of
|
|
256
|
+
# lookahead to group tokens and reduce the number of tokens that needs to
|
|
257
|
+
# be tested.
|
|
258
|
+
# Lexer that reports every token type matching at its position. Each
# reported LexerToken carries the lexer responsible for the position right
# after the match.
class ForkingRegexpLexer
  attr_accessor :position
  attr_reader :scanner, :tokens, :lexer_cache, :eof_token
  protected :lexer_cache

  # tokens::   the token types to try. EofToken instances are removed from
  #            this array (the array passed in is modified in place).
  # eofToken:: EOF token to report when +tokens+ contains no EofToken.
  #            An EofToken found in +tokens+ takes precedence; when neither
  #            is available the class-wide default is used by #peek.
  def initialize(tokens, eofToken = nil)
    @tokens = tokens
    @eof_token = tokens.detect { |token| token.kind_of?(EofToken) } || eofToken
    @tokens.delete_if { |token| token.kind_of?(EofToken) }
  end

  # Fallback EOF token used when no other EOF token is available.
  @@eof_token = EofToken.new

  # Prepares the lexer for scanning +aString+ from the beginning.
  def init(aString)
    @position, @current_tokens = LexerPosition.new, nil
    @scanner = StringScanner.new(aString)
    # Forget the length cached by #peek for a previously scanned string;
    # otherwise end-of-input detection would use the old length.
    @string_length = nil

    # We speed things up by only having one lexer at each position. Since
    # there are typically only a small number of positions we use a
    # BoundedLruCache of size 20 to keep them in. The cache throws out the
    # oldest (least recently used, NOTE! accessed in the cache, not used in
    # the parser) lexer when a new one is inserted. This is to keep the
    # memory consumption down.
    @lexer_cache = BoundedLruCache.new(20)
  end

  # Returns the LexerTokens available at this lexer's position; the result
  # is computed once and then reused.
  #
  # For a skip token, the tokens found after the skipped text are reported
  # instead of the skip token itself. When nothing matches and the position
  # is at or past the end of the string, a single EOF token is reported.
  #
  # Refactor! Complex interactions when tokens are skipped since the
  # next_lexer updates "our" scanner. Find cleaner way of expressing this!
  def peek
    return @current_tokens if @current_tokens
    scanner.pointer = @position.char_position
    @current_tokens = Array.new
    tokens.each do |token|
      if (match = scanner.check(token.regexp))
        if token.skip
          # Token to be skipped => return tokens matching after the skipped one
          @current_tokens.concat next_lexer(match).peek
          # The nested peek moved the shared scanner; move it back.
          scanner.pointer = @position.char_position
        else
          @current_tokens.push LexerToken.new(match, token,
                                              next_lexer(match), @position)
        end
      end
    end
    if @current_tokens.length == 0
      @string_length ||= scanner.string.length
      if @position.char_position >= @string_length
        @current_tokens.push LexerToken.new(nil, eof_token || @@eof_token,
                                            nil, @position)
      end
    end
    return @current_tokens
  end

  def inspect
    "Lexer(#{@position.inspect})"
  end

  protected

  # Returns the lexer for the position just after +matchingString+, taking
  # it from the cache (keyed by character offset) or creating and caching it.
  def next_lexer(matchingString)
    pos = @position + matchingString
    char_pos = pos.char_position
    lexer = self.lexer_cache[char_pos]
    self.lexer_cache[char_pos] = lexer = create_next_lexer(pos) unless lexer
    lexer
  end

  # Creates the lexer for +pos+; it refers back to this lexer for its
  # scanner, tokens and cache.
  def create_next_lexer(pos)
    ReferencingRegexpLexer.new(self, pos)
  end
end
|
|
332
|
+
|
|
333
|
+
# Lexer for a later position in the input. It keeps only its own position
# and obtains scanner, tokens, cache and EOF token from the parent lexer.
class ReferencingRegexpLexer < ForkingRegexpLexer
  # aForkingRegexpLexer:: the parent lexer to delegate to
  # position::            this lexer's position
  # The superclass constructor is intentionally not invoked.
  def initialize(aForkingRegexpLexer, position)
    @parent_lexer = aForkingRegexpLexer
    @position = position
  end

  def inspect
    "RefLexer(#{@position.inspect})"
  end

  # The parent's scanner (shared, not copied).
  def scanner
    @parent_lexer.scanner
  end

  protected

  # New lexers created from here refer to the same parent as this one.
  def create_next_lexer(pos)
    ReferencingRegexpLexer.new(@parent_lexer, pos)
  end

  # The parent's lexer cache.
  def lexer_cache
    @parent_lexer.lexer_cache
  end

  # The parent's EOF token.
  def eof_token
    @parent_lexer.eof_token
  end

  # The parent's token list.
  def tokens
    @parent_lexer.tokens
  end
end
|
|
364
|
+
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
#
|
|
3
|
+
# Rpdf2txt -- PDF to Text Parser
|
|
4
|
+
# Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
|
|
5
|
+
#
|
|
6
|
+
# This library is free software; you can redistribute it and/or
|
|
7
|
+
# modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
# License as published by the Free Software Foundation; either
|
|
9
|
+
# version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
#
|
|
11
|
+
# This library is distributed in the hope that it will be useful,
|
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
# Lesser General Public License for more details.
|
|
15
|
+
#
|
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
# License along with this library; if not, write to the Free Software
|
|
18
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
#
|
|
20
|
+
# ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
|
21
|
+
# hwyss@ywesee.com, aschrafl@ywesee.com
|
|
22
|
+
#
|
|
23
|
+
# AttributesParser -- Rpdf2txt -- 19.12.2002 -- hwyss@ywesee.com
|
|
24
|
+
|
|
25
|
+
require 'rpdf2txt-rockit/rockit'
|
|
26
|
+
|
|
27
|
+
module Rpdf2txt
  GRAMMAR_PATH = File.expand_path('data/pdfattributes.grammar', File.dirname(__FILE__))
  PARSER_PATH = File.expand_path('data/pdfattributes.rb', File.dirname(__FILE__))

  # Returns the PDF attributes parser, regenerating its source file first
  # when the grammar has changed.
  #
  # A copy of the grammar used for the last generation is kept next to the
  # grammar file under the same name prefixed with "_". The parser file is
  # regenerated when that copy is missing or differs from the grammar.
  #
  # grammar_path:: path of the grammar file
  # parser_path::  path of the generated parser file, which is required
  def attributes_parser(grammar_path=GRAMMAR_PATH, parser_path=PARSER_PATH)
    oldpath = "#{File.dirname(grammar_path)}/_#{File.basename(grammar_path)}"
    src = File.read(grammar_path)
    # File.exist? is used because File.exists? is no longer available in
    # current Ruby versions.
    unless File.exist?(oldpath) && File.read(oldpath) == src
      # Remove the stale copy before generating, so that a failed generation
      # is retried on the next call.
      File.delete(oldpath) if File.exist?(oldpath)
      Parse.generate_parser_from_file_to_file(grammar_path, parser_path, '_attr_parser', 'Rpdf2txt')
      File.open(oldpath, 'w') { |f| f << src }
    end
    require parser_path
    Rpdf2txt._attr_parser
  end
  module_function :attributes_parser
end
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
#
|
|
3
|
+
# Rpdf2txt -- PDF to Text Parser
|
|
4
|
+
# Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
|
|
5
|
+
#
|
|
6
|
+
# This library is free software; you can redistribute it and/or
|
|
7
|
+
# modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
# License as published by the Free Software Foundation; either
|
|
9
|
+
# version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
#
|
|
11
|
+
# This library is distributed in the hope that it will be useful,
|
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
# Lesser General Public License for more details.
|
|
15
|
+
#
|
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
# License along with this library; if not, write to the Free Software
|
|
18
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
#
|
|
20
|
+
# ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
|
21
|
+
# hwyss@ywesee.com, aschrafl@ywesee.com
|
|
22
|
+
#
|
|
23
|
+
# TextParser -- Rpdf2txt -- 04.11.2004 -- mwalder@ywesee.com
|
|
24
|
+
# rwaltert@ywesee.com
|
|
25
|
+
|
|
26
|
+
require 'rpdf2txt-rockit/rockit'
|
|
27
|
+
|
|
28
|
+
module Rpdf2txt
  CMAP_GRAMMAR = File.expand_path('data/cmap.grammar',
                                  File.dirname(__FILE__))
  CMAP_PARSER = File.expand_path('data/cmap.rb',
                                 File.dirname(__FILE__))
  CMAP_RANGE_GRAMMAR = File.expand_path('data/cmap_range.grammar',
                                        File.dirname(__FILE__))
  CMAP_RANGE_PARSER = File.expand_path('data/cmap_range.rb',
                                       File.dirname(__FILE__))

  # Returns the cmap parser, regenerating its source file first when the
  # grammar has changed.
  def Rpdf2txt.cmap_parser(grammar_path=CMAP_GRAMMAR,
                           parser_path=CMAP_PARSER)
    load_cmap_parser(grammar_path, parser_path, '_cmap_parser')
  end

  # Returns the cmap range parser, regenerating its source file first when
  # the grammar has changed.
  def Rpdf2txt.cmap_range_parser(grammar_path=CMAP_RANGE_GRAMMAR,
                                 parser_path=CMAP_RANGE_PARSER)
    load_cmap_parser(grammar_path, parser_path, '_cmap_range_parser')
  end

  # Shared implementation of the two public methods of this file.
  #
  # A copy of the grammar used for the last generation is kept next to the
  # grammar file under the same name prefixed with "_". The parser file is
  # regenerated when that copy is missing or differs from the grammar. The
  # parser file is then required and Rpdf2txt.<parser_name> is returned.
  def Rpdf2txt.load_cmap_parser(grammar_path, parser_path, parser_name)
    oldpath = "#{File.dirname(grammar_path)}/_#{File.basename(grammar_path)}"
    src = File.read(grammar_path)
    # File.exist? is used because File.exists? is no longer available in
    # current Ruby versions.
    unless File.exist?(oldpath) && File.read(oldpath) == src
      # Remove the stale copy before generating, so that a failed generation
      # is retried on the next call.
      File.delete(oldpath) if File.exist?(oldpath)
      Parse.generate_parser_from_file_to_file(grammar_path,
                                              parser_path, parser_name, 'Rpdf2txt')
      File.open(oldpath, 'w') { |f| f << src }
    end
    require parser_path
    Rpdf2txt.send(parser_name)
  end
  private_class_method :load_cmap_parser
end
|