rpdf2txt 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. data/History.txt +5 -0
  2. data/LICENCE +515 -0
  3. data/Manifest.txt +126 -0
  4. data/README.txt +30 -0
  5. data/Rakefile +24 -0
  6. data/bin/rpdf2txt +58 -0
  7. data/config.save +12 -0
  8. data/install.rb +1098 -0
  9. data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
  10. data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
  11. data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
  12. data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
  13. data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
  14. data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
  15. data/lib/rpdf2txt-rockit/grammar.rb +644 -0
  16. data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
  17. data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
  18. data/lib/rpdf2txt-rockit/indexable.rb +53 -0
  19. data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
  20. data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
  21. data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
  22. data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
  23. data/lib/rpdf2txt-rockit/profiler.rb +168 -0
  24. data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
  25. data/lib/rpdf2txt-rockit/rockit.rb +76 -0
  26. data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
  27. data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
  28. data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
  29. data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
  30. data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
  31. data/lib/rpdf2txt-rockit/token.rb +364 -0
  32. data/lib/rpdf2txt-rockit/version.rb +3 -0
  33. data/lib/rpdf2txt/attributesparser.rb +42 -0
  34. data/lib/rpdf2txt/cmapparser.rb +65 -0
  35. data/lib/rpdf2txt/data/_cmap.grammar +11 -0
  36. data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
  37. data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
  38. data/lib/rpdf2txt/data/cmap.grammar +11 -0
  39. data/lib/rpdf2txt/data/cmap.rb +37 -0
  40. data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
  41. data/lib/rpdf2txt/data/cmap_range.rb +43 -0
  42. data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
  43. data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
  44. data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
  45. data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
  46. data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
  47. data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
  48. data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
  49. data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
  50. data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
  51. data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
  52. data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
  53. data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
  54. data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
  55. data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
  56. data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
  57. data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
  58. data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
  59. data/lib/rpdf2txt/data/pdftext.grammar +102 -0
  60. data/lib/rpdf2txt/data/pdftext.rb +146 -0
  61. data/lib/rpdf2txt/default_handler.rb +352 -0
  62. data/lib/rpdf2txt/lzw.rb +69 -0
  63. data/lib/rpdf2txt/object.rb +1114 -0
  64. data/lib/rpdf2txt/parser.rb +169 -0
  65. data/lib/rpdf2txt/symbol.rb +408 -0
  66. data/lib/rpdf2txt/text.rb +182 -0
  67. data/lib/rpdf2txt/text_state.rb +434 -0
  68. data/lib/rpdf2txt/textparser.rb +42 -0
  69. data/test/data/3392_obj +0 -0
  70. data/test/data/397_decrypted +15 -0
  71. data/test/data/450_decrypted +153 -0
  72. data/test/data/450_obj +0 -0
  73. data/test/data/452_decrypted +125 -0
  74. data/test/data/454_decrypted +108 -0
  75. data/test/data/456_decrypted +106 -0
  76. data/test/data/458_decrypted +111 -0
  77. data/test/data/458_obj +0 -0
  78. data/test/data/460_decrypted +118 -0
  79. data/test/data/460_obj +0 -0
  80. data/test/data/463_decrypted +117 -0
  81. data/test/data/465_decrypted +107 -0
  82. data/test/data/465_obj +0 -0
  83. data/test/data/90_obj +0 -0
  84. data/test/data/90_obj_comp +1 -0
  85. data/test/data/decrypted +0 -0
  86. data/test/data/encrypt_obj +0 -0
  87. data/test/data/encrypt_string +0 -0
  88. data/test/data/encrypt_string_128bit +0 -0
  89. data/test/data/encrypted_object_stream.pdf +0 -0
  90. data/test/data/firststream +1 -0
  91. data/test/data/index.pdfobj +0 -0
  92. data/test/data/index_2bit.pdfobj +0 -0
  93. data/test/data/index_masked.pdfobj +0 -0
  94. data/test/data/indexed.pdfobj +0 -0
  95. data/test/data/indexed_2bit.pdfobj +0 -0
  96. data/test/data/indexed_masked.pdfobj +0 -0
  97. data/test/data/inline.png +0 -0
  98. data/test/data/logo.png +0 -0
  99. data/test/data/lzw.pdfobj +0 -0
  100. data/test/data/lzw_index.pdfobj +0 -0
  101. data/test/data/page_tree.pdf +148 -0
  102. data/test/data/pdf_20.png +0 -0
  103. data/test/data/pdf_21.png +0 -0
  104. data/test/data/pdf_22.png +0 -0
  105. data/test/data/pdf_50.png +0 -0
  106. data/test/data/png.pdfobj +0 -0
  107. data/test/data/space_bug_stream.txt +119 -0
  108. data/test/data/stream.txt +292 -0
  109. data/test/data/stream_kerning_bug.txt +13 -0
  110. data/test/data/stream_kerning_bug2.txt +6 -0
  111. data/test/data/test.pdf +0 -0
  112. data/test/data/test.txt +8 -0
  113. data/test/data/test_text.txt +42 -0
  114. data/test/data/working_obj +0 -0
  115. data/test/data/working_obj2 +0 -0
  116. data/test/mock.rb +149 -0
  117. data/test/suite.rb +30 -0
  118. data/test/test_pdf_object.rb +1802 -0
  119. data/test/test_pdf_parser.rb +1340 -0
  120. data/test/test_pdf_text.rb +789 -0
  121. data/test/test_space_bug_05_2004.rb +87 -0
  122. data/test/test_stream.rb +194 -0
  123. data/test/test_text_state.rb +315 -0
  124. data/usage-en.txt +112 -0
  125. data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
  126. data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
  127. metadata +220 -0
@@ -0,0 +1,364 @@
1
+ require 'rpdf2txt-rockit/syntax_tree'
2
+ require 'rpdf2txt-rockit/sourcecode_dumpable'
3
+ require 'rpdf2txt-rockit/bounded_lru_cache'
4
+
5
# A Regexp that is always anchored so that it can only match at the start of
# the text (or line) it is applied to.
#
# A pattern that does not already begin with "^" is wrapped as "^(pattern)".
# The wrapped source is kept in @string; #string gives back the pattern
# without that wrapper so that patterns can be composed textually.
class TokenRegexp < Regexp
  # Recognizes a source that consists of exactly one "^( ... )" wrapper.
  # Anchored at both ends and multiline, so patterns containing literal
  # newlines are unwrapped as well.
  WRAPPED_SOURCE = /\A\^\((.*)\)\z/m

  # aStringOrRegexp:: pattern source (String) or a Regexp whose source and
  #                   options are taken over.
  # Raises ArgumentError for any other kind of argument.
  def initialize(aStringOrRegexp)
    case aStringOrRegexp
    when String
      @string = anchor(aStringOrRegexp)
      super @string
    when Regexp
      @string = anchor(aStringOrRegexp.source)
      super(@string, aStringOrRegexp.options)
    else
      raise ArgumentError,
            "expected a String or Regexp, got #{aStringOrRegexp.class}"
    end
  end

  # The pattern source without the "^( )" wrapper. A source that is not of
  # that exact shape (e.g. one that was supplied already starting with "^"
  # but not wrapped in parentheses) is returned unchanged.
  def string
    wrapped = WRAPPED_SOURCE.match(@string)
    wrapped ? wrapped[1] : @string
  end

  private

  # Wraps +source+ as "^(source)" unless it already starts with "^".
  def anchor(source)
    source[0, 1] == "^" ? source : "^(" + source + ")"
  end
end
28
+
29
+ # Short hands for composing token regexp's
30
# Builds a TokenRegexp from a pattern string or a regexp.
# An object whose class is exactly Regexp is reduced to its source first;
# anything else is handed to TokenRegexp.new unchanged.
def tr(aStringOrRegexp)
  pattern =
    if aStringOrRegexp.instance_of?(Regexp)
      aStringOrRegexp.source
    else
      aStringOrRegexp
    end
  TokenRegexp.new(pattern)
end
34
+
35
# Composes several token regexps into one.
#
# tokens::    non-empty list of objects responding to #string
# map::       format string applied to each token's pattern (e.g. "(%s)")
# separator:: text placed between the formatted patterns
#
# Returns the result of tr() for the combined pattern.
# Raises ArgumentError when +tokens+ is empty (previously this failed with a
# NoMethodError on nil).
def tre_compose(tokens, map, separator)
  if tokens.empty?
    raise ArgumentError, "at least one token regexp is required"
  end
  tr(tokens.map { |token| map % token.string }.join(separator))
end
40
+
41
# Alternation: each token pattern is parenthesized and the groups are
# joined with "|".
def ror(*tokens)
  group, divider = "(%s)", "|"
  tre_compose(tokens, group, divider)
end
44
+
45
# Sequence: each token pattern is parenthesized and the groups are
# concatenated without a separator.
def rseq(*tokens)
  group, divider = "(%s)", ""
  tre_compose(tokens, group, divider)
end
48
+
49
# Optional: wraps the token's pattern in a group followed by "?".
def r?(tokenregexp)
  inner = tokenregexp.string
  tr("(#{inner})?")
end
52
+
53
# Zero or more: wraps the token's pattern in a group followed by "*".
def rm(tokenregexp)
  inner = tokenregexp.string
  tr("(#{inner})*")
end
56
+
57
# One or more: wraps the token's pattern in a group followed by "+".
def rp(tokenregexp)
  inner = tokenregexp.string
  tr("(#{inner})+")
end
60
+
61
# A named terminal symbol described by a TokenRegexp.
#
# Trailing constructor options are symbols or strings; the only one that is
# interpreted here is "skip" (case-insensitive), which sets #skip to true.
class Token
  include SourceCodeDumpable

  attr_accessor :name
  attr_reader :regexp, :skip

  # aString::                       the token's name
  # aStringOrRegexpOrTokenRegexp::  pattern, passed to TokenRegexp.new
  # options::                       e.g. :Skip
  def initialize(aString, aStringOrRegexpOrTokenRegexp = "", *options)
    @name = aString
    @regexp = TokenRegexp.new(aStringOrRegexpOrTokenRegexp)
    parse_options(options)
  end

  # Hash over class, name, regexp and skip flag. The value is computed on
  # first use and then cached in @hashvalue.
  def hash
    @hashvalue ||= [self.class, @name, @regexp, @skip].hash
  end

  # Interprets the option list; sets @skip to true when "skip" is among the
  # lower-cased option names and leaves it untouched otherwise.
  def parse_options(options)
    lowered = options.map do |option|
      text = option.kind_of?(Symbol) ? option.id2name : option
      text.downcase
    end
    @skip = true if lowered.include?("skip")
  end

  # Tokens are equal when class, name, the regexp's inspect output and the
  # skip flag all agree.
  def ==(other)
    other.class == self.class &&
      other.name == name &&
      other.regexp.inspect == regexp.inspect &&
      other.skip == skip
  end

  # Applies the token's regexp to aString.
  def match(aString)
    @regexp.match(aString)
  end

  # Maps a lexeme to its value; currently the lexeme itself.
  def value(lexeme)
    # TODO: Add blocks that map lexeme's to values.
    lexeme
  end

  # Builds a SyntaxTree leaf named after this token and records +position+
  # in the tree's :position attribute.
  def create_tree(lexeme, position)
    leaf = SyntaxTree.new(@name, ["lexeme", "value"], [value(lexeme), lexeme])
    leaf.attributes[:position] = position
    leaf
  end

  # Source-code dump via the SourceCodeDumpable helpers; :Skip is appended
  # to the constructor arguments for skipped tokens.
  def to_src(assignToName = nil, nameHash = {})
    constructor_args = [name, as_code(regexp.to_src)]
    constructor_args << :Skip if skip
    assign_to(assignToName, new_of_my_type(*constructor_args))
  end

  # The token's name, or the regexp's inspect output when there is no name.
  def inspect
    name || regexp.inspect
    #osrc = options_to_src
    #"#{name} = #{regexp.inspect} #{osrc.length>0 ? '['+osrc+']' : ''}"
  end

  protected

  # Source text for the option list: ":Skip" or the empty string.
  def options_to_src
    skip ? ":Skip" : ""
  end
end
132
+
133
# Marker token for the end of the input. All EofToken instances compare
# equal to each other.
class EofToken < Token
  # Accepts and ignores any arguments.
  def initialize(*args)
    # The token must never match any input. "(?!)" is an empty negative
    # lookahead, which fails at every position. Using a fixed pattern
    # (instead of a random, merely unlikely string) also gives all instances
    # the same regexp, so instances that are == no longer differ in #hash.
    super("EOF", "(?!)")
  end

  # Equal to any other object of exactly the same class.
  def ==(other)
    other.class == self.class
  end
end
145
+
146
# Marker token for the empty production. All EpsilonToken instances compare
# equal to each other.
class EpsilonToken < Token
  def initialize
    # The token must never match any input. "(?!)" is an empty negative
    # lookahead, which fails at every position. Using a fixed pattern
    # (instead of a random, merely unlikely string) also gives all instances
    # the same regexp, so instances that are == no longer differ in #hash.
    super("epsilon", "(?!)")
  end

  # Equal to any other object of exactly the same class.
  def ==(other)
    other.class == self.class
  end
end
158
+
159
# A token that matches one literal string. The string is escaped with
# Regexp.escape before it is handed to Token, and the unescaped original is
# kept in @string.
class StringToken < Token
  # name::   the token's name
  # string:: the literal text to match (defaults to the name)
  def initialize(name, string = name)
    @string = string
    super(name, Regexp.escape(string))
  end

  # Source-code dump that recreates the token from its name and literal.
  def to_src(assignToName = nil, nameHash = {})
    assign_to(assignToName, new_of_my_type(name, @string))
  end

  # Hash over class, name and literal string; cached after the first call.
  def hash
    @hashvalue || (@hashvalue = [self.class, @name, @string].hash)
  end

  # Debug representation: object id, literal, inspected name and hash.
  # Uses object_id; Object#id no longer exists in Ruby 1.9 and later and
  # made this method raise NoMethodError.
  def to_s
    "#{object_id} #{@string} #{name.inspect} #{hash}"
  end

  # The inspected literal string.
  def inspect
    @string.inspect
  end
end
181
+
182
# Creates a StringToken for +string+ whose name is "StrToken" followed by
# the string's hash value.
def string_token(string)
  generated_name = "StrToken#{string.hash.inspect}"
  StringToken.new(generated_name, string)
end
185
+
186
# A token whose regexp is stored exactly as given. Token#initialize is not
# called, so the regexp is not converted into a TokenRegexp here.
class RegexpToken < Token
  # aString:: the token's name
  # regexp::  stored as-is in @regexp
  # options:: interpreted by parse_options (e.g. :Skip)
  def initialize(aString, regexp, *options)
    @name = aString
    @regexp = regexp
    parse_options(options)
  end
end
192
+
193
# Creates a RegexpToken for +regexp+ whose name is "RegexpToken" followed by
# the regexp's hash value; the options are passed through.
def regexp_token(regexp, *options)
  generated_name = "RegexpToken#{regexp.hash.inspect}"
  RegexpToken.new(generated_name, regexp, *options)
end
196
+
197
# Token factory.
#
# When +re+ is exactly a String, a StringToken with a generated name
# ("StrToken" + hash of the string) is returned; +name+ and +options+ are
# not used in that case. Otherwise a Token is built from all arguments.
def t(name, re, *options)
  return Token.new(name, re, *options) unless re.instance_of?(String)

  StringToken.new("StrToken" + re.hash.inspect, re)
end
204
+
205
+ require 'rpdf2txt-rockit/stringscanner' # DO *NOT* alter since install.rb exploits formatting
206
+
207
+ # Forking lexers return LexerToken's with the info about a matching token
208
+ # and the lexer to access for next tokens.
209
# One match reported by a lexer: the matched text, the token type that
# matched, the lexer to ask for the following tokens, and the position at
# which the match was found.
class LexerToken
  attr_reader :lexeme, :token_type, :lexer, :position

  def initialize(lexeme, tokenType, lexer, position = nil)
    @lexeme = lexeme
    @token_type = tokenType
    @lexer = lexer
    @position = position
  end

  # Delegates to the token type, passing the lexeme and position.
  def create_tree
    @token_type.create_tree(@lexeme, @position)
  end

  # Short form: LT(<inspected lexeme>, <token type name>)
  def inspect
    shown_lexeme = lexeme.inspect
    type_name = token_type.name
    "LT(#{shown_lexeme}, #{type_name})"
  end
end
225
+
226
# Immutable position in the input: zero-based row and column plus the
# absolute character offset.
class LexerPosition
  attr_reader :row, :column, :char_position

  # One line break: "\r\n" counts as a single break, as does a lone "\r"
  # or a lone "\n".
  LINE_BREAK = /\r\n|\r|\n/

  def initialize(row = 0, column = 0, char_position = 0)
    @row, @column, @char_position = row, column, char_position
  end

  # Returns a new LexerPosition advanced past +aString+; the receiver is
  # not modified.
  #
  # The row grows by the number of line breaks in +aString+. Without a line
  # break the column grows by the string's length; otherwise the column is
  # the number of characters after the last line break (0 when the string
  # ends with one).
  def +(aString)
    char_position = @char_position + aString.length
    num_newlines = aString.scan(LINE_BREAK).length
    if num_newlines == 0
      row = @row
      column = @column + aString.length
    else
      row = @row + num_newlines
      # rindex cannot be nil here: a line break was found above.
      column = aString.length - (aString.rindex(/[\r\n]/) + 1)
    end
    LexerPosition.new(row, column, char_position)
  end

  def inspect
    "(row=#{row},column=#{@column})"
  end
end
254
+
255
+ # NOTE: If more performance is needed it might be good to use one char of
256
+ # lookahead to group tokens and reduce the number of tokens that needs to
257
+ # be tested.
258
# Lexer that reports every token matching at its position ("forking"): each
# reported LexerToken carries the lexer responsible for the position right
# after that match.
#
# Usage: create with the token list, call #init with the input string, then
# call #peek.
class ForkingRegexpLexer
  attr_accessor :position
  attr_reader :scanner, :tokens, :lexer_cache, :eof_token
  protected :lexer_cache

  # tokens::   list of tokens to try. NOTE: EofToken entries are removed
  #            from this very array (the caller's object is modified).
  # eofToken:: token to report at the end of input. When omitted, the first
  #            EofToken found in +tokens+ is used; if there is none, #peek
  #            falls back to a class-wide EofToken.
  def initialize(tokens, eofToken = nil)
    @tokens = tokens
    # The explicit argument used to be ignored; it now takes precedence.
    @eof_token = eofToken || tokens.detect {|t| t.kind_of?(EofToken)}
    @tokens.delete_if {|t| t.kind_of?(EofToken)}
  end

  @@eof_token = EofToken.new

  # Prepares the lexer for a new input string.
  def init(aString)
    @position, @current_tokens = LexerPosition.new, nil
    # Forget the length cached by #peek for a previous input.
    @string_length = nil
    @scanner = StringScanner.new(aString)

    # We speed things up by only having one lexer at each position. Since there
    # are typically only a small number of positions we use a BoundedLruCache
    # of size 20 to keep them in. The cache throws out oldest (least recently
    # used, NOTE! accessed in the cache not used in the parser) lexer when
    # new one inserted. This is to keep the memory consumption down.
    #
    @lexer_cache = BoundedLruCache.new(20)
  end

  # Returns the list of LexerTokens that match at this lexer's position.
  # The list is computed once and then cached in @current_tokens.
  #
  # Refactor! Complex interactions when tokens are skipped since the next_lexer
  # update "our" scanner. Find cleaner way of expressing this!
  def peek
    return @current_tokens if @current_tokens
    scanner.pointer = @position.char_position
    @current_tokens = Array.new
    tokens.each do |token|
      if (match = scanner.check(token.regexp))
        if token.skip
          # Token to be skipped => return tokens matching after the skipped one
          @current_tokens.concat next_lexer(match).peek
          # The nested peek moved the shared scanner; move it back.
          scanner.pointer = @position.char_position
        else
          @current_tokens.push LexerToken.new(match, token,
                                              next_lexer(match), @position)
        end
      end
    end
    if @current_tokens.length == 0
      # Nothing matched: report EOF, but only at or beyond the end of input.
      @string_length = scanner.string.length unless @string_length
      if @position.char_position >= @string_length
        @current_tokens.push LexerToken.new(nil, eof_token || @@eof_token,
                                            nil, @position)
      end
    end
    return @current_tokens
  end

  def inspect
    "Lexer(#{@position.inspect})"
  end

  protected

  # Returns the lexer for the position right after +matchingString+,
  # reusing the one stored in the lexer cache for that character offset
  # when there is one.
  def next_lexer(matchingString)
    pos = @position + matchingString
    #create_next_lexer(pos)
    char_pos = pos.char_position
    lexer = self.lexer_cache[char_pos]
    self.lexer_cache[char_pos] = lexer = create_next_lexer(pos) unless lexer
    lexer
  end


  # Creates a lexer for +pos+ that refers back to this lexer.
  def create_next_lexer(pos)
    ReferencingRegexpLexer.new(self, pos)
  end
end
332
+
333
# Lexer for a later position of the same input. It holds only its own
# position and a reference to the lexer it was derived from; scanner, token
# list, EOF token and lexer cache are all read from that parent lexer.
class ReferencingRegexpLexer < ForkingRegexpLexer
  # aForkingRegexpLexer:: the lexer to delegate to
  # position::            this lexer's position
  def initialize(aForkingRegexpLexer, position)
    @parent_lexer = aForkingRegexpLexer
    @position = position
  end

  # The parent's scanner.
  def scanner
    @parent_lexer.scanner
  end

  def inspect
    "RefLexer(#{@position.inspect})"
  end

  protected

  # The parent's token list.
  def tokens
    @parent_lexer.tokens
  end

  # The parent's EOF token.
  def eof_token
    @parent_lexer.eof_token
  end

  # The parent's lexer cache.
  def lexer_cache
    @parent_lexer.lexer_cache
  end

  # New lexers refer to the same parent lexer as this one.
  def create_next_lexer(pos)
    ReferencingRegexpLexer.new(@parent_lexer, pos)
  end
end
364
+
@@ -0,0 +1,3 @@
1
# Version string of the bundled Rockit library.
def rockit_version
  version = "0.3.8"
  version
end
@@ -0,0 +1,42 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Rpdf2txt -- PDF to Text Parser
4
+ # Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2.1 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
+ # hwyss@ywesee.com, aschrafl@ywesee.com
22
+ #
23
+ # AttributesParser -- Rpdf2txt -- 19.12.2002 -- hwyss@ywesee.com
24
+
25
+ require 'rpdf2txt-rockit/rockit'
26
+
27
module Rpdf2txt
  GRAMMAR_PATH = File.expand_path('data/pdfattributes.grammar', File.dirname(__FILE__))
  PARSER_PATH = File.expand_path('data/pdfattributes.rb', File.dirname(__FILE__))

  # Returns the attributes parser, regenerating its source file first when
  # the grammar has changed.
  #
  # A copy of the grammar that the parser was last generated from is kept
  # next to the grammar file, with "_" prefixed to the file name. When that
  # copy is missing or differs from the current grammar, the parser file is
  # regenerated with Parse.generate_parser_from_file_to_file and the copy is
  # rewritten. The parser file is then required and Rpdf2txt._attr_parser is
  # returned.
  #
  # grammar_path:: path of the grammar file
  # parser_path::  path of the generated parser source
  def attributes_parser(grammar_path=GRAMMAR_PATH, parser_path=PARSER_PATH)
    oldpath = File.join(File.dirname(grammar_path),
                        "_" + File.basename(grammar_path))
    src = File.read(grammar_path)
    # File.exist? instead of File.exists?, which was removed in Ruby 3.2.
    unless(File.exist?(oldpath) && File.read(oldpath)==src)
      File.delete(oldpath) if File.exist?(oldpath)
      Parse.generate_parser_from_file_to_file(grammar_path, parser_path, '_attr_parser', 'Rpdf2txt')
      File.open(oldpath, 'w') { |f| f << src }
    end
    # Required lazily: the file may only just have been generated.
    require parser_path
    Rpdf2txt._attr_parser
  end
  module_function :attributes_parser
end
@@ -0,0 +1,65 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Rpdf2txt -- PDF to Text Parser
4
+ # Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2.1 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
+ # hwyss@ywesee.com, aschrafl@ywesee.com
22
+ #
23
+ # TextParser -- Rpdf2txt -- 04.11.2004 -- mwalder@ywesee.com
24
+ # rwaltert@ywesee.com
25
+
26
+ require 'rpdf2txt-rockit/rockit'
27
+
28
module Rpdf2txt
  CMAP_GRAMMAR = File.expand_path('data/cmap.grammar',
                                  File.dirname(__FILE__))
  CMAP_PARSER = File.expand_path('data/cmap.rb',
                                 File.dirname(__FILE__))
  CMAP_RANGE_GRAMMAR = File.expand_path('data/cmap_range.grammar',
                                        File.dirname(__FILE__))
  CMAP_RANGE_PARSER = File.expand_path('data/cmap_range.rb',
                                       File.dirname(__FILE__))

  # Regenerates the parser source at +parser_path+ when the grammar changed.
  #
  # A copy of the grammar that the parser was last generated from is kept
  # next to the grammar file, with "_" prefixed to the file name. When that
  # copy is missing or differs from the current grammar, the parser is
  # regenerated (exposed under the accessor name +accessor+ in module
  # 'Rpdf2txt') and the copy is rewritten.
  def Rpdf2txt.regenerate_stale_parser(grammar_path, parser_path, accessor)
    oldpath = File.join(File.dirname(grammar_path),
                        "_" + File.basename(grammar_path))
    src = File.read(grammar_path)
    # File.exist? instead of File.exists?, which was removed in Ruby 3.2.
    return if File.exist?(oldpath) && File.read(oldpath)==src
    File.delete(oldpath) if File.exist?(oldpath)
    Parse.generate_parser_from_file_to_file(grammar_path,
                                            parser_path, accessor, 'Rpdf2txt')
    File.open(oldpath, 'w') { |f| f << src }
  end
  private_class_method :regenerate_stale_parser

  # Returns the cmap parser (Rpdf2txt._cmap_parser), regenerating its source
  # file first when the grammar has changed.
  #
  # grammar_path:: path of the grammar file
  # parser_path::  path of the generated parser source
  def Rpdf2txt.cmap_parser(grammar_path=CMAP_GRAMMAR,
                           parser_path=CMAP_PARSER)
    regenerate_stale_parser(grammar_path, parser_path, '_cmap_parser')
    # Required lazily: the file may only just have been generated.
    require parser_path
    Rpdf2txt._cmap_parser
  end

  # Returns the cmap range parser (Rpdf2txt._cmap_range_parser),
  # regenerating its source file first when the grammar has changed.
  #
  # grammar_path:: path of the grammar file
  # parser_path::  path of the generated parser source
  def Rpdf2txt.cmap_range_parser(grammar_path=CMAP_RANGE_GRAMMAR,
                                 parser_path=CMAP_RANGE_PARSER)
    regenerate_stale_parser(grammar_path, parser_path, '_cmap_range_parser')
    # Required lazily: the file may only just have been generated.
    require parser_path
    Rpdf2txt._cmap_range_parser
  end
end