rpdf2txt 0.8.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (127) hide show
  1. data/History.txt +5 -0
  2. data/LICENCE +515 -0
  3. data/Manifest.txt +126 -0
  4. data/README.txt +30 -0
  5. data/Rakefile +24 -0
  6. data/bin/rpdf2txt +58 -0
  7. data/config.save +12 -0
  8. data/install.rb +1098 -0
  9. data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
  10. data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
  11. data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
  12. data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
  13. data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
  14. data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
  15. data/lib/rpdf2txt-rockit/grammar.rb +644 -0
  16. data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
  17. data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
  18. data/lib/rpdf2txt-rockit/indexable.rb +53 -0
  19. data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
  20. data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
  21. data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
  22. data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
  23. data/lib/rpdf2txt-rockit/profiler.rb +168 -0
  24. data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
  25. data/lib/rpdf2txt-rockit/rockit.rb +76 -0
  26. data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
  27. data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
  28. data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
  29. data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
  30. data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
  31. data/lib/rpdf2txt-rockit/token.rb +364 -0
  32. data/lib/rpdf2txt-rockit/version.rb +3 -0
  33. data/lib/rpdf2txt/attributesparser.rb +42 -0
  34. data/lib/rpdf2txt/cmapparser.rb +65 -0
  35. data/lib/rpdf2txt/data/_cmap.grammar +11 -0
  36. data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
  37. data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
  38. data/lib/rpdf2txt/data/cmap.grammar +11 -0
  39. data/lib/rpdf2txt/data/cmap.rb +37 -0
  40. data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
  41. data/lib/rpdf2txt/data/cmap_range.rb +43 -0
  42. data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
  43. data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
  44. data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
  45. data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
  46. data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
  47. data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
  48. data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
  49. data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
  50. data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
  51. data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
  52. data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
  53. data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
  54. data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
  55. data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
  56. data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
  57. data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
  58. data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
  59. data/lib/rpdf2txt/data/pdftext.grammar +102 -0
  60. data/lib/rpdf2txt/data/pdftext.rb +146 -0
  61. data/lib/rpdf2txt/default_handler.rb +352 -0
  62. data/lib/rpdf2txt/lzw.rb +69 -0
  63. data/lib/rpdf2txt/object.rb +1114 -0
  64. data/lib/rpdf2txt/parser.rb +169 -0
  65. data/lib/rpdf2txt/symbol.rb +408 -0
  66. data/lib/rpdf2txt/text.rb +182 -0
  67. data/lib/rpdf2txt/text_state.rb +434 -0
  68. data/lib/rpdf2txt/textparser.rb +42 -0
  69. data/test/data/3392_obj +0 -0
  70. data/test/data/397_decrypted +15 -0
  71. data/test/data/450_decrypted +153 -0
  72. data/test/data/450_obj +0 -0
  73. data/test/data/452_decrypted +125 -0
  74. data/test/data/454_decrypted +108 -0
  75. data/test/data/456_decrypted +106 -0
  76. data/test/data/458_decrypted +111 -0
  77. data/test/data/458_obj +0 -0
  78. data/test/data/460_decrypted +118 -0
  79. data/test/data/460_obj +0 -0
  80. data/test/data/463_decrypted +117 -0
  81. data/test/data/465_decrypted +107 -0
  82. data/test/data/465_obj +0 -0
  83. data/test/data/90_obj +0 -0
  84. data/test/data/90_obj_comp +1 -0
  85. data/test/data/decrypted +0 -0
  86. data/test/data/encrypt_obj +0 -0
  87. data/test/data/encrypt_string +0 -0
  88. data/test/data/encrypt_string_128bit +0 -0
  89. data/test/data/encrypted_object_stream.pdf +0 -0
  90. data/test/data/firststream +1 -0
  91. data/test/data/index.pdfobj +0 -0
  92. data/test/data/index_2bit.pdfobj +0 -0
  93. data/test/data/index_masked.pdfobj +0 -0
  94. data/test/data/indexed.pdfobj +0 -0
  95. data/test/data/indexed_2bit.pdfobj +0 -0
  96. data/test/data/indexed_masked.pdfobj +0 -0
  97. data/test/data/inline.png +0 -0
  98. data/test/data/logo.png +0 -0
  99. data/test/data/lzw.pdfobj +0 -0
  100. data/test/data/lzw_index.pdfobj +0 -0
  101. data/test/data/page_tree.pdf +148 -0
  102. data/test/data/pdf_20.png +0 -0
  103. data/test/data/pdf_21.png +0 -0
  104. data/test/data/pdf_22.png +0 -0
  105. data/test/data/pdf_50.png +0 -0
  106. data/test/data/png.pdfobj +0 -0
  107. data/test/data/space_bug_stream.txt +119 -0
  108. data/test/data/stream.txt +292 -0
  109. data/test/data/stream_kerning_bug.txt +13 -0
  110. data/test/data/stream_kerning_bug2.txt +6 -0
  111. data/test/data/test.pdf +0 -0
  112. data/test/data/test.txt +8 -0
  113. data/test/data/test_text.txt +42 -0
  114. data/test/data/working_obj +0 -0
  115. data/test/data/working_obj2 +0 -0
  116. data/test/mock.rb +149 -0
  117. data/test/suite.rb +30 -0
  118. data/test/test_pdf_object.rb +1802 -0
  119. data/test/test_pdf_parser.rb +1340 -0
  120. data/test/test_pdf_text.rb +789 -0
  121. data/test/test_space_bug_05_2004.rb +87 -0
  122. data/test/test_stream.rb +194 -0
  123. data/test/test_text_state.rb +315 -0
  124. data/usage-en.txt +112 -0
  125. data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
  126. data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
  127. metadata +220 -0
@@ -0,0 +1,364 @@
1
+ require 'rpdf2txt-rockit/syntax_tree'
2
+ require 'rpdf2txt-rockit/sourcecode_dumpable'
3
+ require 'rpdf2txt-rockit/bounded_lru_cache'
4
+
5
# A Regexp used for lexing. Unless the pattern already starts with "^" it is
# wrapped as "^(<pattern>)", so a match has to begin at the start of the
# string (or line) being examined.
class TokenRegexp < Regexp
  # Recognises a source that consists entirely of the "^(" ... ")" wrapper.
  # Anchored at both ends, and multiline so that patterns containing a
  # newline are unwrapped as well.
  UNWRAP_PATTERN = /\A\^\((.*)\)\z/m

  # aStringOrRegexp:: a pattern String, or a Regexp whose source and options
  #                   are reused.
  # Raises ArgumentError for any other kind of argument.
  def initialize(aStringOrRegexp)
    case aStringOrRegexp
    when String
      @string = anchored_source(aStringOrRegexp)
      super(@string)
    when Regexp
      @string = anchored_source(aStringOrRegexp.source)
      super(@string, aStringOrRegexp.options)
    else
      raise ArgumentError,
            "expected a String or Regexp, got #{aStringOrRegexp.class}"
    end
  end

  # Returns the pattern without the "^(" ... ")" wrapper when the stored
  # source has exactly that shape, otherwise the stored source unchanged.
  # This is a textual heuristic: a caller-supplied pattern that itself looks
  # like "^(...)" is unwrapped too.
  def string
    wrapped = UNWRAP_PATTERN.match(@string)
    wrapped ? wrapped[1] : @string
  end

  private

  # Wraps +source+ in "^(" ... ")" unless it already begins with "^".
  def anchored_source(source)
    source[0, 1] == "^" ? source : "^(" + source + ")"
  end
end
28
+
29
+ # Short hands for composing token regexp's
30
# Builds a TokenRegexp from a pattern String or a Regexp.
#
# An object whose class is exactly Regexp is reduced to its source before
# being handed to TokenRegexp.new; anything else is passed through as is.
# NOTE(review): reducing to the source discards the Regexp's options
# (e.g. /i) -- confirm that this is intended.
def tr(aStringOrRegexp)
  pattern = if aStringOrRegexp.instance_of?(Regexp)
              aStringOrRegexp.source
            else
              aStringOrRegexp
            end
  TokenRegexp.new(pattern)
end
34
+
35
# Composes several token regexps into one.
#
# tokens::    objects responding to #string (the unwrapped pattern)
# map::       format string applied to every pattern, e.g. "(%s)"
# separator:: text placed between the formatted patterns
#
# Returns the result of tr() on the joined pattern. An empty token list
# yields tr("") instead of failing on a nil element.
def tre_compose(tokens, map, separator)
  tr(tokens.map { |token| map % token.string }.join(separator))
end
40
+
41
# Alternation: a token regexp matching any one of the given token regexps.
def ror(*tokens)
  group_format = "(%s)"
  tre_compose(tokens, group_format, "|")
end
44
+
45
# Sequence: a token regexp matching the given token regexps one after another.
def rseq(*tokens)
  group_format = "(%s)"
  tre_compose(tokens, group_format, "")
end
48
+
49
# Optional: a token regexp matching zero or one occurrence of +tokenregexp+.
def r?(tokenregexp)
  tr("(#{tokenregexp.string})?")
end
52
+
53
# Repetition: a token regexp matching zero or more occurrences of +tokenregexp+.
def rm(tokenregexp)
  tr("(#{tokenregexp.string})*")
end
56
+
57
# Repetition: a token regexp matching one or more occurrences of +tokenregexp+.
def rp(tokenregexp)
  tr("(#{tokenregexp.string})+")
end
60
+
61
# A named token type: a name, the TokenRegexp that recognises it and an
# optional "skip" flag.
class Token
  include SourceCodeDumpable
  attr_reader :skip, :regexp
  attr_accessor :name

  # aString::                        the token name
  # aStringOrRegexpOrTokenRegexp::   pattern handed to TokenRegexp.new
  # options::                        Symbols or Strings; "skip" (any case)
  #                                  sets the skip flag
  def initialize(aString, aStringOrRegexpOrTokenRegexp = "", *options)
    token_regexp = TokenRegexp.new(aStringOrRegexpOrTokenRegexp)
    @name = aString
    @regexp = token_regexp
    parse_options(options)
  end

  # Hash over class, name, regexp and skip flag; computed once and memoised.
  def hash
    @hashvalue ||= [self.class, @name, @regexp, @skip].hash
  end

  # Sets @skip to true when one of the options is named "skip"
  # (compared case-insensitively); otherwise @skip is left untouched.
  def parse_options(options)
    lowered = options.map do |option|
      option.kind_of?(Symbol) ? option.id2name.downcase : option.downcase
    end
    @skip = true if lowered.include?("skip")
  end

  # Tokens are equal when class, name, the regexp's inspect output and the
  # skip flag all agree.
  def ==(other)
    return false unless other.class == self.class
    return false unless other.name == name
    return false unless other.regexp.inspect == regexp.inspect
    other.skip == skip
  end

  # Matches aString against this token's regexp.
  def match(aString)
    @regexp.match(aString)
  end

  # Maps a lexeme to its value; currently the lexeme itself.
  def value(lexeme)
    # TODO: Add blocks that map lexeme's to values.
    lexeme
  end

  # Builds a SyntaxTree node named after this token for +lexeme+ and stores
  # +position+ in the node's :position attribute.
  def create_tree(lexeme, position)
    tree = SyntaxTree.new(@name, ["lexeme", "value"], [value(lexeme), lexeme])
    tree.attributes[:position] = position
    tree
  end

  # Emits source code that recreates this token, passing :Skip as an extra
  # constructor argument when the token is a skip token.
  def to_src(assignToName = nil, nameHash = {})
    constructor_args = [name, as_code(regexp.to_src)]
    constructor_args << :Skip if skip
    assign_to(assignToName, new_of_my_type(*constructor_args))
  end

  # The token name or, when there is none, the regexp's inspect output.
  def inspect
    name || regexp.inspect
  end

  protected

  # Source text for the options of this token.
  def options_to_src
    skip ? ":Skip" : ""
  end
end
132
+
133
# Token type marking the end of the input.
class EofToken < Token
  # Any arguments are accepted and ignored; the token is always named "EOF".
  def initialize(*args)
    # This token must never match any input. "(?!)" is an empty negative
    # lookahead, which fails at every position, so no random or non-ASCII
    # "unlikely" string is needed.
    super("EOF", "(?!)")
  end

  # All EofTokens are considered equal.
  def ==(other)
    other.class == self.class
  end
end
145
+
146
# Token type standing for the empty string (epsilon).
class EpsilonToken < Token
  # Any arguments are accepted and ignored (as EofToken does); the token is
  # always named "epsilon".
  def initialize(*args)
    # This token must never match any input. "(?!)" is an empty negative
    # lookahead, which fails at every position, so no random or non-ASCII
    # "unlikely" string is needed.
    super("epsilon", "(?!)")
  end

  # All EpsilonTokens are considered equal.
  def ==(other)
    other.class == self.class
  end
end
158
+
159
# Token that matches one literal string. The string is regexp-escaped before
# it is handed to Token, so it is matched verbatim.
class StringToken < Token
  # name::   the token name
  # string:: the literal text to match; defaults to the name
  def initialize(name, string = name)
    @string = string
    super(name, Regexp.escape(string))
  end

  # Emits source code that recreates this token from its name and literal.
  def to_src(assignToName = nil, nameHash = {})
    assign_to(assignToName, new_of_my_type(name, @string))
  end

  # Hash over class, name and literal string; computed once and memoised.
  def hash
    @hashvalue ||= [self.class, @name, @string].hash
  end

  # Debug representation. Uses object_id: the Object#id method the original
  # code called no longer exists in Ruby 1.9 and later.
  def to_s
    "#{object_id} #{@string} #{name.inspect} #{hash}"
  end

  # The literal string, inspected.
  def inspect
    @string.inspect
  end
end
181
+
182
# Creates a StringToken for +string+, named "StrToken" followed by the
# string's hash value.
def string_token(string)
  generated_name = "StrToken#{string.hash.inspect}"
  StringToken.new(generated_name, string)
end
185
+
186
# Token whose regexp is stored exactly as given: Token#initialize is not
# called, so the regexp is not passed through TokenRegexp.new here.
class RegexpToken < Token
  # aString:: the token name
  # regexp::  the regexp object to store
  # options:: forwarded to parse_options
  def initialize(aString, regexp, *options)
    @name = aString
    @regexp = regexp
    parse_options(options)
  end
end
192
+
193
# Creates a RegexpToken for +regexp+, named "RegexpToken" followed by the
# regexp's hash value; +options+ are forwarded unchanged.
def regexp_token(regexp, *options)
  generated_name = "RegexpToken#{regexp.hash.inspect}"
  RegexpToken.new(generated_name, regexp, *options)
end
196
+
197
# Shorthand token constructor.
#
# When +re+ is exactly a String, a StringToken is built for it; its name is
# derived from the string's hash, and +name+ and +options+ are not used.
# Otherwise a Token is built from +name+, +re+ and +options+.
def t(name, re, *options)
  return StringToken.new("StrToken" + re.hash.inspect, re) if re.instance_of?(String)

  Token.new(name, re, *options)
end
204
+
205
+ require 'rpdf2txt-rockit/stringscanner' # DO *NOT* alter since install.rb exploits formatting
206
+
207
+ # Forking lexers return LexerToken's with the info about a matching token
208
+ # and the lexer to access for next tokens.
209
# One token found by a lexer: the matched text, the token type that matched,
# the lexer to ask for the tokens that follow, and the position of the match.
class LexerToken
  attr_reader :lexeme, :token_type, :lexer, :position

  def initialize(lexeme, tokenType, lexer, position = nil)
    @lexeme = lexeme
    @token_type = tokenType
    @lexer = lexer
    @position = position
  end

  # Asks the token type to build the tree node for this lexeme and position.
  def create_tree
    token_type.create_tree(lexeme, position)
  end

  def inspect
    format("LT(%s, %s)", lexeme.inspect, token_type.name)
  end
end
225
+
226
# Immutable position in the lexed text: zero-based row and column plus the
# absolute character offset.
class LexerPosition
  # A line break is "\r\n", a lone "\r" or a lone "\n"; a CRLF pair counts
  # as one break, not two.
  LINE_BREAK = /\r\n|\r|\n/

  attr_reader :row, :column, :char_position

  def initialize(row = 0, column = 0, char_position = 0)
    @row, @column, @char_position = row, column, char_position
  end

  # Returns the position reached after consuming aString from here.
  #
  # The character offset grows by the string's length. Without a line break
  # the column grows by the same amount; otherwise the row grows by the
  # number of line breaks and the column becomes the number of characters
  # after the last one (0 when the string ends with a line break).
  def +(aString)
    char_position = @char_position + aString.length
    num_newlines = aString.scan(LINE_BREAK).length
    if num_newlines == 0
      row = @row
      column = @column + aString.length
    else
      row = @row + num_newlines
      last_break_char = aString.rindex(/[\r\n]/)
      column = aString.length - last_break_char - 1
    end
    LexerPosition.new(row, column, char_position)
  end

  def inspect
    "(row=#{row},column=#{@column})"
  end
end
254
+
255
+ # NOTE: If more performance is needed it might be good to use one char of
256
+ # lookahead to group tokens and reduce the number of tokens that needs to
257
+ # be tested.
258
# Lexer that, for the current position, reports every token type whose regexp
# matches there. Each reported LexerToken carries the lexer to use for the
# text that follows that particular match.
class ForkingRegexpLexer
  attr_accessor :position
  attr_reader :scanner, :tokens, :lexer_cache, :eof_token
  protected :lexer_cache

  # tokens::   the token types to try. An EofToken found in the list becomes
  #            this lexer's eof_token, and all EofTokens are removed from the
  #            array (the caller's array is modified in place).
  # eofToken:: NOTE(review): accepted but not used by this method -- confirm
  #            whether it was meant to take precedence over a listed EofToken.
  def initialize(tokens, eofToken = nil)
    @tokens = tokens
    @eof_token = tokens.detect {|t| t.kind_of?(EofToken)}
    @tokens.delete_if {|t| t.kind_of?(EofToken)}
  end

  # Fallback EOF token used by #peek when no EofToken was found in the list.
  @@eof_token = EofToken.new

  # Prepares the lexer for lexing aString from the beginning.
  def init(aString)
    @position, @current_tokens = LexerPosition.new, nil
    # Forget the memoised input length as well; otherwise a lexer reused for
    # a second input would make its EOF decision in #peek with the length of
    # the previous input.
    @string_length = nil
    @scanner = StringScanner.new(aString)

    # We speed things up by only having one lexer at each position. Since there
    # are typically only a small number of positions we use a BoundedLruCache
    # of size 20 to keep them in. The cache throws out oldest (least recently
    # used, NOTE! accessed in the cache not used in the parser) lexer when
    # new one inserted. This is to keep the memory consumption down.
    #
    @lexer_cache = BoundedLruCache.new(20)
  end

  # Returns the LexerTokens that match at this lexer's position; the result
  # is computed once and memoised. When nothing matches and the position is
  # at or beyond the end of the input, a single EOF LexerToken is returned;
  # when nothing matches elsewhere the result is empty.
  #
  # Refactor! Complex interactions when tokens are skipped since the next_lexer
  # update "our" scanner. Find cleaner way of expressing this!
  def peek
    return @current_tokens if @current_tokens
    scanner.pointer = @position.char_position
    @current_tokens = Array.new
    tokens.each do |token|
      if (match = scanner.check(token.regexp))
        if token.skip
          # Token to be skipped => return tokens matching after the skipped one
          @current_tokens.concat next_lexer(match).peek
          # The nested peek moved the shared scanner; point it back here.
          scanner.pointer = @position.char_position
        else
          @current_tokens.push LexerToken.new(match, token,
                                              next_lexer(match), @position)
        end
      end
    end
    if @current_tokens.length == 0
      @string_length = scanner.string.length unless @string_length
      if @position.char_position >= @string_length
        @current_tokens.push LexerToken.new(nil, eof_token || @@eof_token,
                                            nil, @position)
      end
    end
    return @current_tokens
  end

  def inspect
    "Lexer(#{@position.inspect})"
  end

  protected

  # Returns the lexer for the position right after matchingString, reusing
  # the one stored in the lexer cache for that character offset if present.
  def next_lexer(matchingString)
    pos = @position + matchingString
    char_pos = pos.char_position
    lexer = self.lexer_cache[char_pos]
    self.lexer_cache[char_pos] = lexer = create_next_lexer(pos) unless lexer
    lexer
  end

  # Creates the lexer for position +pos+; it refers back to this lexer.
  def create_next_lexer(pos)
    ReferencingRegexpLexer.new(self, pos)
  end
end
332
+
333
# Lexer for a later position in the input. It keeps only its own position and
# a reference to a parent lexer, and delegates scanner, token list, EOF token
# and lexer cache to that parent.
class ReferencingRegexpLexer < ForkingRegexpLexer
  # aForkingRegexpLexer:: the lexer that is delegated to
  # position::            the position this lexer stands for
  def initialize(aForkingRegexpLexer, position)
    @parent_lexer = aForkingRegexpLexer
    @position = position
  end

  # The parent's scanner.
  def scanner
    @parent_lexer.scanner
  end

  def inspect
    "RefLexer(#{@position.inspect})"
  end

  protected

  # The parent's token list.
  def tokens
    @parent_lexer.tokens
  end

  # The parent's EOF token.
  def eof_token
    @parent_lexer.eof_token
  end

  # The parent's lexer cache.
  def lexer_cache
    @parent_lexer.lexer_cache
  end

  # Follow-up lexers refer to the same parent rather than to this lexer.
  def create_next_lexer(pos)
    ReferencingRegexpLexer.new(@parent_lexer, pos)
  end
end
364
+
@@ -0,0 +1,3 @@
1
# Returns the Rockit version string.
def rockit_version
  version = "0.3.8"
  version
end
@@ -0,0 +1,42 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Rpdf2txt -- PDF to Text Parser
4
+ # Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2.1 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
+ # hwyss@ywesee.com, aschrafl@ywesee.com
22
+ #
23
+ # AttributesParser -- Rpdf2txt -- 19.12.2002 -- hwyss@ywesee.com
24
+
25
+ require 'rpdf2txt-rockit/rockit'
26
+
27
module Rpdf2txt
  GRAMMAR_PATH = File.expand_path('data/pdfattributes.grammar', File.dirname(__FILE__))
  PARSER_PATH = File.expand_path('data/pdfattributes.rb', File.dirname(__FILE__))

  # Returns the PDF attributes parser, generating its source file first when
  # necessary.
  #
  # A copy of the grammar the parser was last generated from is kept next to
  # the grammar file under the same name prefixed with "_". When that copy is
  # missing or differs from the current grammar, the parser file is
  # regenerated and the copy is rewritten. The parser file is then required
  # and Rpdf2txt._attr_parser is returned.
  #
  # grammar_path:: path of the grammar file
  # parser_path::  path of the generated parser source
  def attributes_parser(grammar_path=GRAMMAR_PATH, parser_path=PARSER_PATH)
    oldpath = File.join(File.dirname(grammar_path),
                        "_" + File.basename(grammar_path))
    src = File.read(grammar_path)
    # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
    unless File.exist?(oldpath) && File.read(oldpath) == src
      File.delete(oldpath) if File.exist?(oldpath)
      Parse.generate_parser_from_file_to_file(grammar_path, parser_path, '_attr_parser', 'Rpdf2txt')
      File.open(oldpath, 'w') { |f| f << src }
    end
    require parser_path
    Rpdf2txt._attr_parser
  end
  module_function :attributes_parser
end
@@ -0,0 +1,65 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Rpdf2txt -- PDF to Text Parser
4
+ # Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2.1 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
+ # hwyss@ywesee.com, aschrafl@ywesee.com
22
+ #
23
+ # TextParser -- Rpdf2txt -- 04.11.2004 -- mwalder@ywesee.com
24
+ # rwaltert@ywesee.com
25
+
26
+ require 'rpdf2txt-rockit/rockit'
27
+
28
module Rpdf2txt
  CMAP_GRAMMAR = File.expand_path('data/cmap.grammar',
                                  File.dirname(__FILE__))
  CMAP_PARSER = File.expand_path('data/cmap.rb',
                                 File.dirname(__FILE__))
  CMAP_RANGE_GRAMMAR = File.expand_path('data/cmap_range.grammar',
                                        File.dirname(__FILE__))
  CMAP_RANGE_PARSER = File.expand_path('data/cmap_range.rb',
                                       File.dirname(__FILE__))

  # Returns the cmap parser, generating its source file first when necessary.
  #
  # A copy of the grammar the parser was last generated from is kept next to
  # the grammar file under the same name prefixed with "_". When that copy is
  # missing or differs from the current grammar, the parser file is
  # regenerated and the copy is rewritten. The parser file is then required
  # and Rpdf2txt._cmap_parser is returned.
  def Rpdf2txt.cmap_parser(grammar_path=CMAP_GRAMMAR,
                           parser_path=CMAP_PARSER)
    oldpath = File.join(File.dirname(grammar_path),
                        "_" + File.basename(grammar_path))
    src = File.read(grammar_path)
    # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
    unless File.exist?(oldpath) && File.read(oldpath) == src
      File.delete(oldpath) if File.exist?(oldpath)
      Parse.generate_parser_from_file_to_file(grammar_path,
        parser_path, '_cmap_parser', 'Rpdf2txt')
      File.open(oldpath, 'w') { |f| f << src }
    end
    require parser_path
    Rpdf2txt._cmap_parser
  end

  # Returns the cmap range parser. Works like cmap_parser, but generates with
  # the method name '_cmap_range_parser' and returns
  # Rpdf2txt._cmap_range_parser.
  def Rpdf2txt.cmap_range_parser(grammar_path=CMAP_RANGE_GRAMMAR,
                                 parser_path=CMAP_RANGE_PARSER)
    oldpath = File.join(File.dirname(grammar_path),
                        "_" + File.basename(grammar_path))
    src = File.read(grammar_path)
    # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
    unless File.exist?(oldpath) && File.read(oldpath) == src
      File.delete(oldpath) if File.exist?(oldpath)
      Parse.generate_parser_from_file_to_file(grammar_path,
        parser_path, '_cmap_range_parser', 'Rpdf2txt')
      File.open(oldpath, 'w') { |f| f << src }
    end
    require parser_path
    Rpdf2txt._cmap_range_parser
  end
end