podoff 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG.txt CHANGED
@@ -2,6 +2,13 @@
2
2
  = podoff CHANGELOG.txt
3
3
 
4
4
 
5
+ == podoff 1.1.0 released 2015-10-25
6
+
7
+ - more tolerant at parsing (StringScanner)
8
+ - bin/podoff
9
+ - Document#rewrite(path)
10
+
11
+
5
12
  == podoff 1.0.0 released 2015-10-23
6
13
 
7
14
  - leverage incremental updates
data/lib/podoff.rb CHANGED
@@ -23,10 +23,13 @@
23
23
  # Made in Japan.
24
24
  #++
25
25
 
26
+ require 'strscan'
27
+ require 'stringio'
28
+
26
29
 
27
30
  module Podoff
28
31
 
29
- VERSION = '1.0.0'
32
+ VERSION = '1.1.0'
30
33
 
31
34
  def self.load(path, encoding='iso-8859-1')
32
35
 
@@ -38,13 +41,6 @@ module Podoff
38
41
  Podoff::Document.new(s)
39
42
  end
40
43
 
41
- #OBJ_ATTRIBUTES =
42
- # { type: 'Type', subtype: 'Subtype',
43
- # parent: 'Parent', kids: 'Kids', contents: 'Contents', annots: 'Annots',
44
- # pagenum: 'pdftk_PageNum' }
45
- OBJ_ATTRIBUTES =
46
- { type: 'Type', contents: 'Contents', pagenum: 'pdftk_PageNum' }
47
-
48
44
  class Document
49
45
 
50
46
  def self.load(path, encoding='iso-8859-1')
@@ -58,6 +54,7 @@ module Podoff
58
54
  end
59
55
 
60
56
  attr_reader :source
57
+ attr_reader :version
61
58
  attr_reader :xref
62
59
  attr_reader :objs
63
60
  attr_reader :obj_counters
@@ -68,9 +65,10 @@ module Podoff
68
65
  def initialize(s)
69
66
 
70
67
  fail ArgumentError.new('not a PDF file') \
71
- unless s.match(/\A%PDF-\d+\.\d+\n/)
68
+ unless s.match(/\A%PDF-\d+\.\d+\s/)
72
69
 
73
70
  @source = s
71
+ @version = nil
74
72
  @xref = nil
75
73
  @objs = {}
76
74
  @obj_counters = {}
@@ -78,47 +76,41 @@ module Podoff
78
76
 
79
77
  @additions = {}
80
78
 
81
- index = 0
82
- matches = {}
83
- #
84
- loop do
85
-
86
- matches[:obj] ||= s.match(/^(\d+ \d+) obj\b/, index)
87
- matches[:endobj] ||= s.match(/\bendobj\b/, index)
88
- #
89
- OBJ_ATTRIBUTES.each do |k, v|
90
- matches[k] ||= s.match(/\/#{v} (\/?[^\/\n<>]+)/, index)
91
- end
92
- #
93
- matches[:startxref] ||= s.match(/\bstartxref\s+(\d+)\s*%%EOF/, index)
94
-
95
- objm = matches[:obj]
96
- sxrm = matches[:startxref]
97
-
98
- break unless sxrm || objm
79
+ sca = ::StringScanner.new(s)
80
+ @version = sca.scan(/%PDF-\d+\.\d+/)
99
81
 
100
- fail ArgumentError.new('failed to find "startxref"') unless sxrm
82
+ loop do
101
83
 
102
- @root = nil if @root && index > @root.offset(0).last
103
- @root ||= s.match(/\/Root (\d+ \d+) R\b/, index)
84
+ i = sca.skip_until(
85
+ /(startxref\s+\d+|\d+\s+\d+\s+obj|\/Root\s+\d+\s+\d+\s+R)/)
104
86
 
105
- sxri = sxrm.offset(0).first
106
- obji = objm ? objm.offset(0).first : sxri + 1
87
+ m = sca.matched
88
+ break unless m
107
89
 
108
- if obji < sxri
109
- obj = Podoff::Obj.extract(self, matches)
90
+ if m[0] == 's'
91
+ @xref = m.split(' ').last.to_i
92
+ elsif m[0] == '/'
93
+ @root = extract_ref(m)
94
+ else
95
+ obj = Podoff::Obj.extract(self, sca)
110
96
  @objs[obj.ref] = obj
111
97
  @obj_counters[obj.ref] = (@obj_counters[obj.ref] || 0) + 1
112
- index = obj.end_index + 1
113
- else
114
- @xref = sxrm[1].to_i
115
- index = sxrm.offset(0).last + 1
116
- matches.delete(:startxref)
117
98
  end
118
99
  end
119
100
 
120
- fail ArgumentError.new('found no /Root') unless @root
121
- @root = @root[1]
101
+ if @root == nil
102
+ sca.pos = 0
103
+ loop do
104
+ i = sca.skip_until(/\/Root\s+\d+\s+\d+\s+R/)
105
+ break unless sca.matched
106
+ @root = extract_ref(sca.matched)
107
+ end
108
+ end
109
+ end
110
+
111
+ def extract_ref(s)
112
+
113
+ s.gsub(/\s+/, ' ').gsub(/[^0-9 ]+/, '').strip
122
114
  end
123
115
 
124
116
  def updated?
@@ -232,7 +224,12 @@ module Podoff
232
224
 
233
225
  def write(path)
234
226
 
235
- f = (path == :string) ? StringIO.new : File.open(path, 'wb')
227
+ f =
228
+ case path
229
+ when :string, '-' then StringIO.new
230
+ when String then File.open(path, 'wb')
231
+ else path
232
+ end
236
233
 
237
234
  f.write(@source)
238
235
 
@@ -274,9 +271,72 @@ module Podoff
274
271
  f.write("%%EOF\n")
275
272
  end
276
273
 
277
- f.close
274
+ f.close if path.is_a?(String) || path.is_a?(Symbol)
278
275
 
279
- path == :string ? f.string : nil
276
+ f.is_a?(StringIO) ? f.string : nil
277
+ end
278
+
279
+ def rewrite(path=:string)
280
+
281
+ f =
282
+ case path
283
+ when :string, '-' then StringIO.new
284
+ when String then File.open(path, 'wb')
285
+ else path
286
+ end
287
+
288
+ v = source.match(/%PDF-\d+\.\d+/)[0]
289
+ f.write(v)
290
+ f.write("\n")
291
+
292
+ ptrs = {}
293
+
294
+ objs.keys.sort.each do |k|
295
+ ptrs[k] = f.pos + 1
296
+ f.write(objs[k].source)
297
+ f.write("\n")
298
+ end
299
+
300
+ xref = f.pos + 1
301
+ max = objs.keys.inject(-1) { |i, k| [ i, k.split(' ')[0].to_i ].max }
302
+
303
+ #f.write("xref\n0 #{max}\n0000000000 65535 f\n")
304
+ f.write("xref\n0 1\n0000000000 65535 f\n")
305
+
306
+ partitions = [ [] ]
307
+ #
308
+ (1..max).each do |i|
309
+ k = "#{i} 0"
310
+ last = partitions.last
311
+ if ptrs.has_key?(k)
312
+ last << i
313
+ else
314
+ partitions << [] unless last == []
315
+ end
316
+ end
317
+ #
318
+ partitions.each do |part|
319
+
320
+ f.write("#{part.first} #{part.size}\n")
321
+
322
+ part.each do |i|
323
+ k = "#{i} 0"
324
+ #f.write(sprintf("%010d 00000 n %% %s\n", ptrs[k], k))
325
+ f.write(sprintf("%010d 00000 n\n", ptrs[k]))
326
+ end
327
+ end
328
+
329
+ f.write("trailer\n")
330
+ f.write("<<\n")
331
+ f.write("/Size #{objs.size}\n")
332
+ f.write("/Root #{root} R\n")
333
+ f.write(">>\n")
334
+ f.write("startxref #{xref}\n")
335
+ f.write("%%EOF\n")
336
+
337
+ f.close if path.is_a?(String) || path.is_a?(Symbol)
338
+
339
+ f.is_a?(StringIO) ? f.string : nil
280
340
  end
281
341
 
282
342
  private
@@ -292,24 +352,26 @@ module Podoff
292
352
 
293
353
  class Obj
294
354
 
295
- def self.extract(doc, matches)
355
+ ATTRIBUTES =
356
+ { type: 'Type', contents: 'Contents', pagenum: 'pdftk_PageNum' }
296
357
 
297
- re = matches[:obj][1]
298
- st = matches[:obj].offset(0).first
299
- en = matches[:endobj].offset(0).last - 1
358
+ def self.extract(doc, sca)
300
359
 
301
- atts = {}
360
+ re = sca.matched[0..-4].strip
361
+ st = sca.pos - sca.matched.length
302
362
 
303
- OBJ_ATTRIBUTES.keys.each do |k|
304
- m = matches[k]
305
- if m && m.offset(0).last < en
306
- atts[k] = m[1].strip
307
- matches.delete(k)
308
- end
363
+ i = sca.skip_until(/endobj/); return nil unless i
364
+ en = sca.pos - 1
365
+
366
+ atts = {}
367
+ ATTRIBUTES.each do |k, v|
368
+ sca.pos = st
369
+ i = sca.skip_until(/\/#{v}\b/); next unless i
370
+ next if sca.pos > en
371
+ atts[k] = sca.scan(/ *\/?[^\n\r\/>]+/).strip
309
372
  end
310
373
 
311
- matches.delete(:obj)
312
- matches.delete(:endobj)
374
+ sca.pos = en
313
375
 
314
376
  Podoff::Obj.new(doc, re, st, en, atts)
315
377
  end
@@ -373,62 +435,6 @@ module Podoff
373
435
  r ? r.to_i : nil
374
436
  end
375
437
 
376
- # def parent
377
- #
378
- # r = @attributes[:parent]
379
- # r ? r[0..-2].strip : nil
380
- # end
381
- #
382
- # def kids
383
- #
384
- # r = @attributes[:kids]
385
- # (r || '').split(/[\[\]R]/).collect(&:strip).reject(&:empty?)
386
- # end
387
- #
388
- # def contents
389
- #
390
- # r = @attributes[:contents]
391
- # (r || '').split(/[\[\]R]/).collect(&:strip).reject(&:empty?)
392
- # end
393
-
394
- # def add_annotation(ref)
395
- #
396
- # if annots = @attributes[:annots]
397
- # fail "implement me!"
398
- # else
399
- # i = @source.index('/Type ')
400
- # @source.insert(i, "/Annots [#{ref} R]\n")
401
- # end
402
- # recompute_attributes
403
- # end
404
-
405
- # def add_free_text(x, y, text, font, size)
406
- #
407
- # fail ArgumentError.new('target is not a page') unless type == '/Page'
408
- #
409
- # nref = document.new_ref
410
- #
411
- # s = [
412
- # "#{nref} obj <<",
413
- # "/Type /Annot",
414
- # "/Subtype /FreeText",
415
- # "/Da (/F1 70 Tf 0 100 Td)",
416
- # "/Rect [0 0 500 600]",
417
- # "/Contents (#{text})",
418
- # ">>",
419
- # "endobj"
420
- # ].join("\n")
421
- # anno = Obj.create(document, nref, s)
422
- #
423
- # page = self.replicate
424
- # page.add_annotation(nref)
425
- #
426
- # document.add(anno)
427
- # document.add(page)
428
- #
429
- # anno
430
- # end
431
-
432
438
  def insert_font(nick, obj_or_ref)
433
439
 
434
440
  fail ArgumentError.new("target '#{ref}' not a replica") \
@@ -462,9 +468,9 @@ module Podoff
462
468
  def recompute_attributes
463
469
 
464
470
  @attributes =
465
- OBJ_ATTRIBUTES.inject({}) do |h, (k, v)|
466
- m = @source.match(/\/#{v} (\/?[^\/\n<>]+)/)
467
- h[k] = m[1] if m
471
+ ATTRIBUTES.inject({}) do |h, (k, v)|
472
+ m = @source.match(/\/#{v}\s+(\/?[^\/\n<>]+)/)
473
+ h[k] = m[1].strip if m
468
474
  h
469
475
  end
470
476
  end
@@ -481,7 +487,7 @@ module Podoff
481
487
 
482
488
  fail ArgumentError.new("obj not replicated") unless @source
483
489
 
484
- pkey = OBJ_ATTRIBUTES[key]
490
+ pkey = ATTRIBUTES[key]
485
491
 
486
492
  if v = @attributes[key]
487
493
  v = concat(v, ref)
@@ -504,10 +510,6 @@ module Podoff
504
510
  @content = StringIO.new
505
511
  end
506
512
 
507
- #def document; obj.document; end
508
- #def ref; obj.ref; end
509
- #def source; self; end
510
-
511
513
  def tf(font_name, font_size)
512
514
 
513
515
  n = font_name[0] == '/' ? font_name[1..-1] : font_name
data/spec/core_spec.rb CHANGED
@@ -17,8 +17,8 @@ describe Podoff do
17
17
  d = Podoff.load('pdfs/t0.pdf')
18
18
 
19
19
  expect(d.class).to eq(Podoff::Document)
20
- expect(d.xref).to eq(414)
21
20
  expect(d.objs.keys).to eq([ '1 0', '2 0', '3 0', '4 0', '5 0', '6 0' ])
21
+ expect(d.xref).to eq(414)
22
22
 
23
23
  #pp d.objs.values.collect(&:to_a)
24
24
 
@@ -35,6 +35,8 @@ describe Podoff do
35
35
  [ 1, 1, 1, 1, 1, 1 ])
36
36
 
37
37
  expect(d.root).to eq('1 0')
38
+
39
+ expect(d.pages.size).to eq(1)
38
40
  end
39
41
 
40
42
  it 'loads a PDF document' do
@@ -48,6 +50,8 @@ describe Podoff do
48
50
  expect(d.objs.keys).to include('273 0')
49
51
 
50
52
  expect(d.root).to eq('65 0')
53
+
54
+ expect(d.pages.size).to eq(3)
51
55
  end
52
56
 
53
57
  it 'loads a PDF document with incremental updates' do
@@ -66,6 +70,25 @@ describe Podoff do
66
70
  expect(d.root).to eq('1 0')
67
71
  end
68
72
 
73
+ it 'loads a [re]compressed PDF documents' do
74
+
75
+ d = Podoff.load('pdfs/qdocument0.pdf')
76
+
77
+ expect(d.class).to eq(Podoff::Document)
78
+ expect(d.xref).to eq(1612815)
79
+ expect(d.objs.size).to eq(273)
80
+
81
+ expect(d.root).to eq('1 0')
82
+
83
+ #d.objs.each do |ref, o|
84
+ # p [ o.ref, o.attributes ]
85
+ #end
86
+
87
+ expect(d.pages.size).to eq(3)
88
+ expect(d.pages.first.attributes[:pagenum]).to eq('1')
89
+ expect(d.objs['46 0'].attributes[:type]).to eq('/Annot')
90
+ end
91
+
69
92
  it 'rejects items that are not PDF documents' do
70
93
 
71
94
  expect {
@@ -77,43 +77,6 @@ describe Podoff::Document do
77
77
  end
78
78
  end
79
79
 
80
- describe '#write' do
81
-
82
- it 'writes the document to a given path' do
83
-
84
- @d.write('tmp/out.pdf')
85
-
86
- s = File.open('tmp/out.pdf', 'r:iso8859-1') { |f| f.read }
87
- lines = s.split("\n")
88
-
89
- expect(lines.first).to match(/^%PDF-1.7$/)
90
- expect(lines.last).to match(/^%%EOF$/)
91
- end
92
-
93
- it 'writes open streams as well' do
94
-
95
- d = Podoff.load('pdfs/t0.pdf')
96
-
97
- pa = d.re_add(d.page(1))
98
- st = d.add_stream
99
- st.bt(10, 20, 'hello open stream')
100
- pa.insert_contents(st)
101
-
102
- s = d.write(:string)
103
-
104
- expect(
105
- d.write(:string).index(%{
106
- 7 0 obj
107
- << /Length 37 >>
108
- stream
109
- BT 10 20 Td (hello open stream) Tj ET
110
- endstream
111
- endobj
112
- }.strip)
113
- ).to eq(722)
114
- end
115
- end
116
-
117
80
  describe '#dup' do
118
81
 
119
82
  it 'produces a shallow copy of the document' do
@@ -272,6 +235,107 @@ BT /Helvetica 35 Tf 40 50 Td (sixty there) Tj ET
272
235
  expect(re.source).to eq(pa.source)
273
236
  expect(re.source).not_to equal(pa.source)
274
237
  end
238
+
239
+ it 'recomputes the attributes correctly' do
240
+
241
+ d = Podoff.load('pdfs/qdocument0.pdf')
242
+
243
+ pa = d.re_add(d.page(1))
244
+
245
+ expect(pa.attributes).to eq(
246
+ { type: '/Page', contents: '151 0 R', pagenum: '1' })
247
+ end
248
+ end
249
+ end
250
+
251
+ describe '#write' do
252
+
253
+ it 'writes the document to a given path' do
254
+
255
+ @d.write('tmp/out.pdf')
256
+
257
+ s = File.open('tmp/out.pdf', 'r:iso8859-1') { |f| f.read }
258
+ lines = s.split("\n")
259
+
260
+ expect(lines.first).to match(/^%PDF-1.7$/)
261
+ expect(lines.last).to match(/^%%EOF$/)
262
+ end
263
+
264
+ it 'writes open streams as well' do
265
+
266
+ d = Podoff.load('pdfs/t0.pdf')
267
+
268
+ pa = d.re_add(d.page(1))
269
+ st = d.add_stream
270
+ st.bt(10, 20, 'hello open stream')
271
+ pa.insert_contents(st)
272
+
273
+ s = d.write(:string)
274
+
275
+ expect(
276
+ d.write(:string).index(%{
277
+ 7 0 obj
278
+ << /Length 37 >>
279
+ stream
280
+ BT 10 20 Td (hello open stream) Tj ET
281
+ endstream
282
+ endobj
283
+ }.strip)
284
+ ).to eq(722)
285
+ end
286
+ end
287
+
288
+ describe '#rewrite' do
289
+
290
+ it 'rewrites a document in one go' do
291
+
292
+ d = Podoff.load('pdfs/t2.pdf')
293
+
294
+ s = d.rewrite(:string)
295
+
296
+ expect(s.strip).to eq(%{
297
+ %PDF-1.4
298
+ 1 0 obj <</Type /Catalog /Pages 2 0 R>>
299
+ endobj
300
+ 2 0 obj <</Type /Pages /Kids [3 0 R] /Count 1>>
301
+ endobj
302
+ 3 0 obj <</Type /Page /Parent 2 0 R /Resources 4 0 R /MediaBox [0 0 500 800] /Contents [6 0 R 7 0 R]>>
303
+ endobj
304
+ 4 0 obj <</Font <</F1 5 0 R>>>>
305
+ endobj
306
+ 5 0 obj <</Type /Font /Subtype /Type1 /BaseFont /Helvetica>>
307
+ endobj
308
+ 6 0 obj
309
+ <</Length 44>>
310
+ stream
311
+ BT /F1 24 Tf 175 720 Td (Hello Nadaa!)Tj ET
312
+ endstream
313
+ endobj
314
+ 7 0 obj
315
+ <</Length 44>>
316
+ stream
317
+ BT /F1 24 Tf 175 520 Td (Smurf Megane)Tj ET
318
+ endstream
319
+ endobj
320
+ xref
321
+ 0 1
322
+ 0000000000 65535 f
323
+ 1 7
324
+ 0000000010 00000 n
325
+ 0000000057 00000 n
326
+ 0000000112 00000 n
327
+ 0000000222 00000 n
328
+ 0000000261 00000 n
329
+ 0000000329 00000 n
330
+ 0000000420 00000 n
331
+ trailer
332
+ <<
333
+ /Size 7
334
+ /Root 1 0 R
335
+ >>
336
+ startxref 511
337
+ %%EOF
338
+ }.strip)
275
339
  end
276
340
  end
277
341
  end
data/spec/obj_spec.rb CHANGED
@@ -232,6 +232,21 @@ endobj
232
232
  expect(pa.source).to match(/\/Font\s+<<\s+\/MyHelv #{fo.ref} R\s+/)
233
233
  end
234
234
  end
235
+
236
+ describe '#add_to_attribute' do
237
+
238
+ it 'adds to a list of references' do
239
+
240
+ d = Podoff.load('pdfs/qdocument0.pdf')
241
+
242
+ o = d.re_add('56 0')
243
+
244
+ o.send(:add_to_attribute, :contents, '9999 0')
245
+
246
+ expect(o.attributes).to eq(
247
+ { type: '/Page', contents: '[151 0 R 9999 0 R]', pagenum: '1' })
248
+ end
249
+ end
235
250
  end
236
251
  end
237
252
 
data/todo.txt CHANGED
@@ -1,13 +1,21 @@
1
1
 
2
- [ ] stop using the st-mark thing
3
- [ ] doc.add_stream { |st| st.bt ... }
4
-
2
+ [o] implement Document#rewrite
3
+ [o] doc.add_stream { |st| st.bt ... }
5
4
  [o] st = d.addstream
6
5
  st.tf '/helv', 12
7
6
  st.bt 5, 6, "hello"
8
7
  d.write('out.pdf') # closes the stream...
9
8
  [ ] fail if insert_content or insert_font on an unclosed stream obj
10
9
 
10
+ [o] Document.write('-') (stdout inspiration)
11
+
12
+
13
+ [ ] stop using the st-mark thing
11
14
  [ ] recompress idea? uncompress with pdftk, recompress with podoff
12
- smaller docs anyway...
15
+ smaller docs anyway... /Filter /FlateDecode
16
+
17
+ decompress:
18
+ qpdf --qdf --object-streams=disable in.pdf out.pdf
19
+ recompress:
20
+ qpdf in.pdf out.pdf
13
21
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: podoff
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-10-23 00:00:00.000000000 Z
12
+ date: 2015-10-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake