podoff 1.0.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG.txt CHANGED
@@ -2,6 +2,13 @@
2
2
  = podoff CHANGELOG.txt
3
3
 
4
4
 
5
+ == podoff 1.1.0 released 2015-10-25
6
+
7
+ - more tolerant parsing (StringScanner)
8
+ - bin/podoff
9
+ - Document#rewrite(path)
10
+
11
+
5
12
  == podoff 1.0.0 released 2015-10-23
6
13
 
7
14
  - leverage incremental updates
data/lib/podoff.rb CHANGED
@@ -23,10 +23,13 @@
23
23
  # Made in Japan.
24
24
  #++
25
25
 
26
+ require 'strscan'
27
+ require 'stringio'
28
+
26
29
 
27
30
  module Podoff
28
31
 
29
- VERSION = '1.0.0'
32
+ VERSION = '1.1.0'
30
33
 
31
34
  def self.load(path, encoding='iso-8859-1')
32
35
 
@@ -38,13 +41,6 @@ module Podoff
38
41
  Podoff::Document.new(s)
39
42
  end
40
43
 
41
- #OBJ_ATTRIBUTES =
42
- # { type: 'Type', subtype: 'Subtype',
43
- # parent: 'Parent', kids: 'Kids', contents: 'Contents', annots: 'Annots',
44
- # pagenum: 'pdftk_PageNum' }
45
- OBJ_ATTRIBUTES =
46
- { type: 'Type', contents: 'Contents', pagenum: 'pdftk_PageNum' }
47
-
48
44
  class Document
49
45
 
50
46
  def self.load(path, encoding='iso-8859-1')
@@ -58,6 +54,7 @@ module Podoff
58
54
  end
59
55
 
60
56
  attr_reader :source
57
+ attr_reader :version
61
58
  attr_reader :xref
62
59
  attr_reader :objs
63
60
  attr_reader :obj_counters
@@ -68,9 +65,10 @@ module Podoff
68
65
  def initialize(s)
69
66
 
70
67
  fail ArgumentError.new('not a PDF file') \
71
- unless s.match(/\A%PDF-\d+\.\d+\n/)
68
+ unless s.match(/\A%PDF-\d+\.\d+\s/)
72
69
 
73
70
  @source = s
71
+ @version = nil
74
72
  @xref = nil
75
73
  @objs = {}
76
74
  @obj_counters = {}
@@ -78,47 +76,41 @@ module Podoff
78
76
 
79
77
  @additions = {}
80
78
 
81
- index = 0
82
- matches = {}
83
- #
84
- loop do
85
-
86
- matches[:obj] ||= s.match(/^(\d+ \d+) obj\b/, index)
87
- matches[:endobj] ||= s.match(/\bendobj\b/, index)
88
- #
89
- OBJ_ATTRIBUTES.each do |k, v|
90
- matches[k] ||= s.match(/\/#{v} (\/?[^\/\n<>]+)/, index)
91
- end
92
- #
93
- matches[:startxref] ||= s.match(/\bstartxref\s+(\d+)\s*%%EOF/, index)
94
-
95
- objm = matches[:obj]
96
- sxrm = matches[:startxref]
97
-
98
- break unless sxrm || objm
79
+ sca = ::StringScanner.new(s)
80
+ @version = sca.scan(/%PDF-\d+\.\d+/)
99
81
 
100
- fail ArgumentError.new('failed to find "startxref"') unless sxrm
82
+ loop do
101
83
 
102
- @root = nil if @root && index > @root.offset(0).last
103
- @root ||= s.match(/\/Root (\d+ \d+) R\b/, index)
84
+ i = sca.skip_until(
85
+ /(startxref\s+\d+|\d+\s+\d+\s+obj|\/Root\s+\d+\s+\d+\s+R)/)
104
86
 
105
- sxri = sxrm.offset(0).first
106
- obji = objm ? objm.offset(0).first : sxri + 1
87
+ m = sca.matched
88
+ break unless m
107
89
 
108
- if obji < sxri
109
- obj = Podoff::Obj.extract(self, matches)
90
+ if m[0] == 's'
91
+ @xref = m.split(' ').last.to_i
92
+ elsif m[0] == '/'
93
+ @root = extract_ref(m)
94
+ else
95
+ obj = Podoff::Obj.extract(self, sca)
110
96
  @objs[obj.ref] = obj
111
97
  @obj_counters[obj.ref] = (@obj_counters[obj.ref] || 0) + 1
112
- index = obj.end_index + 1
113
- else
114
- @xref = sxrm[1].to_i
115
- index = sxrm.offset(0).last + 1
116
- matches.delete(:startxref)
117
98
  end
118
99
  end
119
100
 
120
- fail ArgumentError.new('found no /Root') unless @root
121
- @root = @root[1]
101
+ if @root == nil
102
+ sca.pos = 0
103
+ loop do
104
+ i = sca.skip_until(/\/Root\s+\d+\s+\d+\s+R/)
105
+ break unless sca.matched
106
+ @root = extract_ref(sca.matched)
107
+ end
108
+ end
109
+ end
110
+
111
+ def extract_ref(s)
112
+
113
+ s.gsub(/\s+/, ' ').gsub(/[^0-9 ]+/, '').strip
122
114
  end
123
115
 
124
116
  def updated?
@@ -232,7 +224,12 @@ module Podoff
232
224
 
233
225
  def write(path)
234
226
 
235
- f = (path == :string) ? StringIO.new : File.open(path, 'wb')
227
+ f =
228
+ case path
229
+ when :string, '-' then StringIO.new
230
+ when String then File.open(path, 'wb')
231
+ else path
232
+ end
236
233
 
237
234
  f.write(@source)
238
235
 
@@ -274,9 +271,72 @@ module Podoff
274
271
  f.write("%%EOF\n")
275
272
  end
276
273
 
277
- f.close
274
+ f.close if path.is_a?(String) || path.is_a?(Symbol)
278
275
 
279
- path == :string ? f.string : nil
276
+ f.is_a?(StringIO) ? f.string : nil
277
+ end
278
+
279
+ def rewrite(path=:string)
280
+
281
+ f =
282
+ case path
283
+ when :string, '-' then StringIO.new
284
+ when String then File.open(path, 'wb')
285
+ else path
286
+ end
287
+
288
+ v = source.match(/%PDF-\d+\.\d+/)[0]
289
+ f.write(v)
290
+ f.write("\n")
291
+
292
+ ptrs = {}
293
+
294
+ objs.keys.sort.each do |k|
295
+ ptrs[k] = f.pos + 1
296
+ f.write(objs[k].source)
297
+ f.write("\n")
298
+ end
299
+
300
+ xref = f.pos + 1
301
+ max = objs.keys.inject(-1) { |i, k| [ i, k.split(' ')[0].to_i ].max }
302
+
303
+ #f.write("xref\n0 #{max}\n0000000000 65535 f\n")
304
+ f.write("xref\n0 1\n0000000000 65535 f\n")
305
+
306
+ partitions = [ [] ]
307
+ #
308
+ (1..max).each do |i|
309
+ k = "#{i} 0"
310
+ last = partitions.last
311
+ if ptrs.has_key?(k)
312
+ last << i
313
+ else
314
+ partitions << [] unless last == []
315
+ end
316
+ end
317
+ #
318
+ partitions.each do |part|
319
+
320
+ f.write("#{part.first} #{part.size}\n")
321
+
322
+ part.each do |i|
323
+ k = "#{i} 0"
324
+ #f.write(sprintf("%010d 00000 n %% %s\n", ptrs[k], k))
325
+ f.write(sprintf("%010d 00000 n\n", ptrs[k]))
326
+ end
327
+ end
328
+
329
+ f.write("trailer\n")
330
+ f.write("<<\n")
331
+ f.write("/Size #{objs.size}\n")
332
+ f.write("/Root #{root} R\n")
333
+ f.write(">>\n")
334
+ f.write("startxref #{xref}\n")
335
+ f.write("%%EOF\n")
336
+
337
+ f.close if path.is_a?(String) || path.is_a?(Symbol)
338
+
339
+ f.is_a?(StringIO) ? f.string : nil
280
340
  end
281
341
 
282
342
  private
@@ -292,24 +352,26 @@ module Podoff
292
352
 
293
353
  class Obj
294
354
 
295
- def self.extract(doc, matches)
355
+ ATTRIBUTES =
356
+ { type: 'Type', contents: 'Contents', pagenum: 'pdftk_PageNum' }
296
357
 
297
- re = matches[:obj][1]
298
- st = matches[:obj].offset(0).first
299
- en = matches[:endobj].offset(0).last - 1
358
+ def self.extract(doc, sca)
300
359
 
301
- atts = {}
360
+ re = sca.matched[0..-4].strip
361
+ st = sca.pos - sca.matched.length
302
362
 
303
- OBJ_ATTRIBUTES.keys.each do |k|
304
- m = matches[k]
305
- if m && m.offset(0).last < en
306
- atts[k] = m[1].strip
307
- matches.delete(k)
308
- end
363
+ i = sca.skip_until(/endobj/); return nil unless i
364
+ en = sca.pos - 1
365
+
366
+ atts = {}
367
+ ATTRIBUTES.each do |k, v|
368
+ sca.pos = st
369
+ i = sca.skip_until(/\/#{v}\b/); next unless i
370
+ next if sca.pos > en
371
+ atts[k] = sca.scan(/ *\/?[^\n\r\/>]+/).strip
309
372
  end
310
373
 
311
- matches.delete(:obj)
312
- matches.delete(:endobj)
374
+ sca.pos = en
313
375
 
314
376
  Podoff::Obj.new(doc, re, st, en, atts)
315
377
  end
@@ -373,62 +435,6 @@ module Podoff
373
435
  r ? r.to_i : nil
374
436
  end
375
437
 
376
- # def parent
377
- #
378
- # r = @attributes[:parent]
379
- # r ? r[0..-2].strip : nil
380
- # end
381
- #
382
- # def kids
383
- #
384
- # r = @attributes[:kids]
385
- # (r || '').split(/[\[\]R]/).collect(&:strip).reject(&:empty?)
386
- # end
387
- #
388
- # def contents
389
- #
390
- # r = @attributes[:contents]
391
- # (r || '').split(/[\[\]R]/).collect(&:strip).reject(&:empty?)
392
- # end
393
-
394
- # def add_annotation(ref)
395
- #
396
- # if annots = @attributes[:annots]
397
- # fail "implement me!"
398
- # else
399
- # i = @source.index('/Type ')
400
- # @source.insert(i, "/Annots [#{ref} R]\n")
401
- # end
402
- # recompute_attributes
403
- # end
404
-
405
- # def add_free_text(x, y, text, font, size)
406
- #
407
- # fail ArgumentError.new('target is not a page') unless type == '/Page'
408
- #
409
- # nref = document.new_ref
410
- #
411
- # s = [
412
- # "#{nref} obj <<",
413
- # "/Type /Annot",
414
- # "/Subtype /FreeText",
415
- # "/Da (/F1 70 Tf 0 100 Td)",
416
- # "/Rect [0 0 500 600]",
417
- # "/Contents (#{text})",
418
- # ">>",
419
- # "endobj"
420
- # ].join("\n")
421
- # anno = Obj.create(document, nref, s)
422
- #
423
- # page = self.replicate
424
- # page.add_annotation(nref)
425
- #
426
- # document.add(anno)
427
- # document.add(page)
428
- #
429
- # anno
430
- # end
431
-
432
438
  def insert_font(nick, obj_or_ref)
433
439
 
434
440
  fail ArgumentError.new("target '#{ref}' not a replica") \
@@ -462,9 +468,9 @@ module Podoff
462
468
  def recompute_attributes
463
469
 
464
470
  @attributes =
465
- OBJ_ATTRIBUTES.inject({}) do |h, (k, v)|
466
- m = @source.match(/\/#{v} (\/?[^\/\n<>]+)/)
467
- h[k] = m[1] if m
471
+ ATTRIBUTES.inject({}) do |h, (k, v)|
472
+ m = @source.match(/\/#{v}\s+(\/?[^\/\n<>]+)/)
473
+ h[k] = m[1].strip if m
468
474
  h
469
475
  end
470
476
  end
@@ -481,7 +487,7 @@ module Podoff
481
487
 
482
488
  fail ArgumentError.new("obj not replicated") unless @source
483
489
 
484
- pkey = OBJ_ATTRIBUTES[key]
490
+ pkey = ATTRIBUTES[key]
485
491
 
486
492
  if v = @attributes[key]
487
493
  v = concat(v, ref)
@@ -504,10 +510,6 @@ module Podoff
504
510
  @content = StringIO.new
505
511
  end
506
512
 
507
- #def document; obj.document; end
508
- #def ref; obj.ref; end
509
- #def source; self; end
510
-
511
513
  def tf(font_name, font_size)
512
514
 
513
515
  n = font_name[0] == '/' ? font_name[1..-1] : font_name
data/spec/core_spec.rb CHANGED
@@ -17,8 +17,8 @@ describe Podoff do
17
17
  d = Podoff.load('pdfs/t0.pdf')
18
18
 
19
19
  expect(d.class).to eq(Podoff::Document)
20
- expect(d.xref).to eq(414)
21
20
  expect(d.objs.keys).to eq([ '1 0', '2 0', '3 0', '4 0', '5 0', '6 0' ])
21
+ expect(d.xref).to eq(414)
22
22
 
23
23
  #pp d.objs.values.collect(&:to_a)
24
24
 
@@ -35,6 +35,8 @@ describe Podoff do
35
35
  [ 1, 1, 1, 1, 1, 1 ])
36
36
 
37
37
  expect(d.root).to eq('1 0')
38
+
39
+ expect(d.pages.size).to eq(1)
38
40
  end
39
41
 
40
42
  it 'loads a PDF document' do
@@ -48,6 +50,8 @@ describe Podoff do
48
50
  expect(d.objs.keys).to include('273 0')
49
51
 
50
52
  expect(d.root).to eq('65 0')
53
+
54
+ expect(d.pages.size).to eq(3)
51
55
  end
52
56
 
53
57
  it 'loads a PDF document with incremental updates' do
@@ -66,6 +70,25 @@ describe Podoff do
66
70
  expect(d.root).to eq('1 0')
67
71
  end
68
72
 
73
+ it 'loads a [re]compressed PDF documents' do
74
+
75
+ d = Podoff.load('pdfs/qdocument0.pdf')
76
+
77
+ expect(d.class).to eq(Podoff::Document)
78
+ expect(d.xref).to eq(1612815)
79
+ expect(d.objs.size).to eq(273)
80
+
81
+ expect(d.root).to eq('1 0')
82
+
83
+ #d.objs.each do |ref, o|
84
+ # p [ o.ref, o.attributes ]
85
+ #end
86
+
87
+ expect(d.pages.size).to eq(3)
88
+ expect(d.pages.first.attributes[:pagenum]).to eq('1')
89
+ expect(d.objs['46 0'].attributes[:type]).to eq('/Annot')
90
+ end
91
+
69
92
  it 'rejects items that are not PDF documents' do
70
93
 
71
94
  expect {
@@ -77,43 +77,6 @@ describe Podoff::Document do
77
77
  end
78
78
  end
79
79
 
80
- describe '#write' do
81
-
82
- it 'writes the document to a given path' do
83
-
84
- @d.write('tmp/out.pdf')
85
-
86
- s = File.open('tmp/out.pdf', 'r:iso8859-1') { |f| f.read }
87
- lines = s.split("\n")
88
-
89
- expect(lines.first).to match(/^%PDF-1.7$/)
90
- expect(lines.last).to match(/^%%EOF$/)
91
- end
92
-
93
- it 'writes open streams as well' do
94
-
95
- d = Podoff.load('pdfs/t0.pdf')
96
-
97
- pa = d.re_add(d.page(1))
98
- st = d.add_stream
99
- st.bt(10, 20, 'hello open stream')
100
- pa.insert_contents(st)
101
-
102
- s = d.write(:string)
103
-
104
- expect(
105
- d.write(:string).index(%{
106
- 7 0 obj
107
- << /Length 37 >>
108
- stream
109
- BT 10 20 Td (hello open stream) Tj ET
110
- endstream
111
- endobj
112
- }.strip)
113
- ).to eq(722)
114
- end
115
- end
116
-
117
80
  describe '#dup' do
118
81
 
119
82
  it 'produces a shallow copy of the document' do
@@ -272,6 +235,107 @@ BT /Helvetica 35 Tf 40 50 Td (sixty there) Tj ET
272
235
  expect(re.source).to eq(pa.source)
273
236
  expect(re.source).not_to equal(pa.source)
274
237
  end
238
+
239
+ it 'recomputes the attributes correctly' do
240
+
241
+ d = Podoff.load('pdfs/qdocument0.pdf')
242
+
243
+ pa = d.re_add(d.page(1))
244
+
245
+ expect(pa.attributes).to eq(
246
+ { type: '/Page', contents: '151 0 R', pagenum: '1' })
247
+ end
248
+ end
249
+ end
250
+
251
+ describe '#write' do
252
+
253
+ it 'writes the document to a given path' do
254
+
255
+ @d.write('tmp/out.pdf')
256
+
257
+ s = File.open('tmp/out.pdf', 'r:iso8859-1') { |f| f.read }
258
+ lines = s.split("\n")
259
+
260
+ expect(lines.first).to match(/^%PDF-1.7$/)
261
+ expect(lines.last).to match(/^%%EOF$/)
262
+ end
263
+
264
+ it 'writes open streams as well' do
265
+
266
+ d = Podoff.load('pdfs/t0.pdf')
267
+
268
+ pa = d.re_add(d.page(1))
269
+ st = d.add_stream
270
+ st.bt(10, 20, 'hello open stream')
271
+ pa.insert_contents(st)
272
+
273
+ s = d.write(:string)
274
+
275
+ expect(
276
+ d.write(:string).index(%{
277
+ 7 0 obj
278
+ << /Length 37 >>
279
+ stream
280
+ BT 10 20 Td (hello open stream) Tj ET
281
+ endstream
282
+ endobj
283
+ }.strip)
284
+ ).to eq(722)
285
+ end
286
+ end
287
+
288
+ describe '#rewrite' do
289
+
290
+ it 'rewrites a document in one go' do
291
+
292
+ d = Podoff.load('pdfs/t2.pdf')
293
+
294
+ s = d.rewrite(:string)
295
+
296
+ expect(s.strip).to eq(%{
297
+ %PDF-1.4
298
+ 1 0 obj <</Type /Catalog /Pages 2 0 R>>
299
+ endobj
300
+ 2 0 obj <</Type /Pages /Kids [3 0 R] /Count 1>>
301
+ endobj
302
+ 3 0 obj <</Type /Page /Parent 2 0 R /Resources 4 0 R /MediaBox [0 0 500 800] /Contents [6 0 R 7 0 R]>>
303
+ endobj
304
+ 4 0 obj <</Font <</F1 5 0 R>>>>
305
+ endobj
306
+ 5 0 obj <</Type /Font /Subtype /Type1 /BaseFont /Helvetica>>
307
+ endobj
308
+ 6 0 obj
309
+ <</Length 44>>
310
+ stream
311
+ BT /F1 24 Tf 175 720 Td (Hello Nadaa!)Tj ET
312
+ endstream
313
+ endobj
314
+ 7 0 obj
315
+ <</Length 44>>
316
+ stream
317
+ BT /F1 24 Tf 175 520 Td (Smurf Megane)Tj ET
318
+ endstream
319
+ endobj
320
+ xref
321
+ 0 1
322
+ 0000000000 65535 f
323
+ 1 7
324
+ 0000000010 00000 n
325
+ 0000000057 00000 n
326
+ 0000000112 00000 n
327
+ 0000000222 00000 n
328
+ 0000000261 00000 n
329
+ 0000000329 00000 n
330
+ 0000000420 00000 n
331
+ trailer
332
+ <<
333
+ /Size 7
334
+ /Root 1 0 R
335
+ >>
336
+ startxref 511
337
+ %%EOF
338
+ }.strip)
275
339
  end
276
340
  end
277
341
  end
data/spec/obj_spec.rb CHANGED
@@ -232,6 +232,21 @@ endobj
232
232
  expect(pa.source).to match(/\/Font\s+<<\s+\/MyHelv #{fo.ref} R\s+/)
233
233
  end
234
234
  end
235
+
236
+ describe '#add_to_attribute' do
237
+
238
+ it 'adds to a list of references' do
239
+
240
+ d = Podoff.load('pdfs/qdocument0.pdf')
241
+
242
+ o = d.re_add('56 0')
243
+
244
+ o.send(:add_to_attribute, :contents, '9999 0')
245
+
246
+ expect(o.attributes).to eq(
247
+ { type: '/Page', contents: '[151 0 R 9999 0 R]', pagenum: '1' })
248
+ end
249
+ end
235
250
  end
236
251
  end
237
252
 
data/todo.txt CHANGED
@@ -1,13 +1,21 @@
1
1
 
2
- [ ] stop using the st-mark thing
3
- [ ] doc.add_stream { |st| st.bt ... }
4
-
2
+ [o] implement Document#rewrite
3
+ [o] doc.add_stream { |st| st.bt ... }
5
4
  [o] st = d.addstream
6
5
  st.tf '/helv', 12
7
6
  st.bt 5, 6, "hello"
8
7
  d.write('out.pdf') # closes the stream...
9
8
  [ ] fail if insert_content or insert_font on an unclosed stream obj
10
9
 
10
+ [o] Document.write('-') (stdout inspiration)
11
+
12
+
13
+ [ ] stop using the st-mark thing
11
14
  [ ] recompress idea? uncompress with pdftk, recompress with podoff
12
- smaller docs anyway...
15
+ smaller docs anyway... /Filter /FlateDecode
16
+
17
+ decompress:
18
+ qpdf --qdf --object-streams=disable in.pdf out.pdf
19
+ recompress:
20
+ qpdf in.pdf out.pdf
13
21
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: podoff
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-10-23 00:00:00.000000000 Z
12
+ date: 2015-10-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake