podoff 0.9.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG.txt CHANGED
@@ -2,6 +2,16 @@
2
2
  = podoff CHANGELOG.txt
3
3
 
4
4
 
5
+ == podoff 1.0.0 released 2015-10-23
6
+
7
+ - leverage incremental updates
8
+
9
+
10
+ == podoff 0.9.1 not released
11
+
12
+ - ensure Obj#contents accepts arrays
13
+
14
+
5
15
  == podoff 0.9.0 released 2015-10-21
6
16
 
7
17
  - beta release
data/README.md CHANGED
@@ -1,6 +1,9 @@
1
1
 
2
2
  # podoff
3
3
 
4
+ [![Build Status](https://secure.travis-ci.org/jmettraux/podoff.png)](http://travis-ci.org/jmettraux/podoff)
5
+ [![Gem Version](https://badge.fury.io/rb/podoff.png)](http://badge.fury.io/rb/podoff)
6
+
4
7
  A Ruby tool to deface PDF documents.
5
8
 
6
9
  If you're looking for serious libraries, look at
data/lib/podoff.rb CHANGED
@@ -26,250 +26,521 @@
26
26
 
27
27
  module Podoff
28
28
 
29
- VERSION = '0.9.0'
29
+ VERSION = '1.0.0'
30
30
 
31
- def self.load(path)
31
+ def self.load(path, encoding='iso-8859-1')
32
32
 
33
- Podoff::Document.new(
34
- File.open(path, 'r:iso8859-1') { |f| f.read })
33
+ Podoff::Document.load(path, encoding)
35
34
  end
36
35
 
36
+ def self.parse(s)
37
+
38
+ Podoff::Document.new(s)
39
+ end
40
+
41
+ #OBJ_ATTRIBUTES =
42
+ # { type: 'Type', subtype: 'Subtype',
43
+ # parent: 'Parent', kids: 'Kids', contents: 'Contents', annots: 'Annots',
44
+ # pagenum: 'pdftk_PageNum' }
45
+ OBJ_ATTRIBUTES =
46
+ { type: 'Type', contents: 'Contents', pagenum: 'pdftk_PageNum' }
47
+
37
48
  class Document
38
49
 
39
- attr_reader :header
50
+ def self.load(path, encoding='iso-8859-1')
51
+
52
+ Podoff::Document.new(File.open(path, 'r:' + encoding) { |f| f.read })
53
+ end
54
+
55
+ def self.parse(s)
56
+
57
+ Podoff::Document.new(s)
58
+ end
59
+
60
+ attr_reader :source
61
+ attr_reader :xref
40
62
  attr_reader :objs
41
- attr_reader :footer
63
+ attr_reader :obj_counters
64
+ attr_reader :root
65
+ #
66
+ attr_reader :additions
42
67
 
43
68
  def initialize(s)
44
69
 
45
70
  fail ArgumentError.new('not a PDF file') \
46
71
  unless s.match(/\A%PDF-\d+\.\d+\n/)
47
72
 
48
- @header = []
49
- #
73
+ @source = s
74
+ @xref = nil
50
75
  @objs = {}
51
- cur = nil
76
+ @obj_counters = {}
77
+ @root = nil
78
+
79
+ @additions = {}
80
+
81
+ index = 0
82
+ matches = {}
52
83
  #
53
- @footer = nil
54
-
55
- s.split("\n").each do |l|
56
-
57
- if @footer
58
- @footer << l
59
- elsif m = /^(\d+ \d+) obj\b/.match(l)
60
- cur = (@objs[m[1]] = Obj.new(self, m[1]))
61
- cur << l
62
- elsif m = /^xref\b/.match(l)
63
- @footer = []
64
- @footer << l
65
- elsif cur
66
- cur << l
84
+ loop do
85
+
86
+ matches[:obj] ||= s.match(/^(\d+ \d+) obj\b/, index)
87
+ matches[:endobj] ||= s.match(/\bendobj\b/, index)
88
+ #
89
+ OBJ_ATTRIBUTES.each do |k, v|
90
+ matches[k] ||= s.match(/\/#{v} (\/?[^\/\n<>]+)/, index)
91
+ end
92
+ #
93
+ matches[:startxref] ||= s.match(/\bstartxref\s+(\d+)\s*%%EOF/, index)
94
+
95
+ objm = matches[:obj]
96
+ sxrm = matches[:startxref]
97
+
98
+ break unless sxrm || objm
99
+
100
+ fail ArgumentError.new('failed to find "startxref"') unless sxrm
101
+
102
+ @root = nil if @root && index > @root.offset(0).last
103
+ @root ||= s.match(/\/Root (\d+ \d+) R\b/, index)
104
+
105
+ sxri = sxrm.offset(0).first
106
+ obji = objm ? objm.offset(0).first : sxri + 1
107
+
108
+ if obji < sxri
109
+ obj = Podoff::Obj.extract(self, matches)
110
+ @objs[obj.ref] = obj
111
+ @obj_counters[obj.ref] = (@obj_counters[obj.ref] || 0) + 1
112
+ index = obj.end_index + 1
67
113
  else
68
- @header << l
114
+ @xref = sxrm[1].to_i
115
+ index = sxrm.offset(0).last + 1
116
+ matches.delete(:startxref)
69
117
  end
70
118
  end
71
- end
72
119
 
73
- def fonts; @objs.values.select(&:is_font?); end
74
- def pages; @objs.values.select(&:is_page?); end
120
+ fail ArgumentError.new('found no /Root') unless @root
121
+ @root = @root[1]
122
+ end
75
123
 
76
- def page(i)
124
+ def updated?
77
125
 
78
- i < 1 ? nil : @objs.values.find { |o| o.page_number == i }
126
+ @additions.any?
79
127
  end
80
128
 
81
129
  def dup
82
130
 
83
- d0 = self
131
+ o = self
132
+
133
+ self.class.allocate.instance_eval do
84
134
 
85
- d = d0.class.allocate
135
+ @source = o.source
136
+ @xref = o.xref
86
137
 
87
- d.instance_eval do
88
- @header = d0.header.dup
89
- @footer = d0.footer.dup
90
- @objs = d0.objs.values.inject({}) { |h, v| h[v.ref] = v.dup(d); h }
138
+ @objs = o.objs.inject({}) { |h, (k, v)| h[k] = v.dup(self); h }
139
+ @obj_counters = o.obj_counters.dup
140
+
141
+ @root = o.root
142
+
143
+ @additions =
144
+ o.additions.inject({}) { |h, (k, v)| h[k] = v.dup(self); h }
145
+
146
+ self
91
147
  end
148
+ end
149
+
150
+ def pages
151
+
152
+ @objs.values.select { |o| o.type == '/Page' }
153
+ end
154
+
155
+ def page(index)
156
+
157
+ return nil if index == 0
158
+
159
+ pas = pages
160
+ return nil if pas.empty?
161
+
162
+ return (
163
+ index > 0 ? pas.at(index - 1) : pas.at(index)
164
+ ) unless pas.first.attributes[:pagenum]
165
+
166
+ if index < 0
167
+ max = pas.inject(0) { |n, pa| [ n, pa.page_number ].max }
168
+ index = max + 1 + index
169
+ end
170
+
171
+ pas.find { |pa| pa.page_number == index }
172
+ end
173
+
174
+ def new_ref
175
+
176
+ "#{
177
+ @objs.keys.inject(-1) { |i, r| [ i, r.split(' ').first.to_i ].max } + 1
178
+ } 0"
179
+ end
180
+
181
+ def add(obj)
182
+
183
+ @objs[obj.ref] = obj
184
+ @additions[obj.ref] = obj
185
+
186
+ obj
187
+ end
188
+
189
+ def add_base_font(name)
190
+
191
+ name = name[1..-1] if name[0] == '/'
192
+
193
+ ref = new_ref
194
+
195
+ add(
196
+ Obj.create(
197
+ self,
198
+ ref,
199
+ [
200
+ "#{ref} obj",
201
+ "<< /Type /Font /Subtype /Type1 /BaseFont /#{name} >>",
202
+ "endobj"
203
+ ].join(' ')))
204
+ end
205
+
206
+ def add_stream(s=nil, &block)
207
+
208
+ ref = new_ref
209
+
210
+ s = s || make_stream(&block)
211
+
212
+ s = [
213
+ "#{ref} obj",
214
+ "<< /Length #{s.length} >>",
215
+ "stream\n#{s}\nendstream",
216
+ "endobj"
217
+ ].join("\n") if s.is_a?(String)
218
+
219
+ o = add(Obj.create(self, ref, s))
220
+
221
+ s.is_a?(Podoff::Stream) ? s : o
222
+ end
223
+
224
+ def re_add(obj_or_ref)
225
+
226
+ obj = obj_or_ref.is_a?(String) ? @objs[obj_or_ref] : obj_or_ref
227
+
228
+ obj = obj.replicate unless obj.replica?
92
229
 
93
- d
230
+ add(obj)
94
231
  end
95
232
 
96
233
  def write(path)
97
234
 
98
- File.open(path, 'wb') do |f|
235
+ f = (path == :string) ? StringIO.new : File.open(path, 'wb')
99
236
 
100
- @header.each { |l| f.print(l); f.print("\n") }
237
+ f.write(@source)
101
238
 
102
- @objs.values.each do |o|
103
- o.lines.each { |l| f.print(l); f.print("\n") }
239
+ if @additions.any?
240
+
241
+ pointers = {}
242
+
243
+ @additions.values.each do |o|
244
+ f.write("\n")
245
+ pointers[o.ref] = f.pos + 1
246
+ if o.source.is_a?(String)
247
+ f.write(o.source)
248
+ else # Stream
249
+ s = o.source.to_s
250
+ f.write("#{o.ref} obj\n<< /Length #{s.length} >>\n")
251
+ f.write("stream\n#{s}\nendstream\nendobj")
252
+ end
253
+ end
254
+ f.write("\n\n")
255
+
256
+ xref = f.pos + 1
257
+
258
+ f.write("xref\n")
259
+ f.write("0 1\n")
260
+ f.write("0000000000 65535 f\n")
261
+
262
+ pointers.each do |k, v|
263
+ f.write("#{k.split(' ').first} 1\n")
264
+ f.write(sprintf("%010d 00000 n\n", v))
104
265
  end
105
266
 
106
- @footer.each { |l| f.print(l); f.print("\n") }
267
+ f.write("trailer\n")
268
+ f.write("<<\n")
269
+ f.write("/Prev #{self.xref}\n")
270
+ f.write("/Size #{objs.size}\n")
271
+ f.write("/Root #{root} R\n")
272
+ f.write(">>\n")
273
+ f.write("startxref #{xref}\n")
274
+ f.write("%%EOF\n")
107
275
  end
276
+
277
+ f.close
278
+
279
+ path == :string ? f.string : nil
280
+ end
281
+
282
+ private
283
+
284
+ def make_stream(&block)
285
+
286
+ s = Stream.new
287
+ s.instance_exec(&block) if block
288
+
289
+ s
108
290
  end
109
291
  end
110
292
 
111
293
  class Obj
112
294
 
295
+ def self.extract(doc, matches)
296
+
297
+ re = matches[:obj][1]
298
+ st = matches[:obj].offset(0).first
299
+ en = matches[:endobj].offset(0).last - 1
300
+
301
+ atts = {}
302
+
303
+ OBJ_ATTRIBUTES.keys.each do |k|
304
+ m = matches[k]
305
+ if m && m.offset(0).last < en
306
+ atts[k] = m[1].strip
307
+ matches.delete(k)
308
+ end
309
+ end
310
+
311
+ matches.delete(:obj)
312
+ matches.delete(:endobj)
313
+
314
+ Podoff::Obj.new(doc, re, st, en, atts)
315
+ end
316
+
113
317
  attr_reader :document
114
318
  attr_reader :ref
115
- attr_reader :lines
319
+ attr_reader :start_index, :end_index
320
+ attr_reader :attributes
116
321
 
117
- def initialize(doc, ref)
322
+ def initialize(doc, ref, st, en, atts, source=nil)
118
323
 
119
324
  @document = doc
120
325
  @ref = ref
121
- @lines = []
326
+ @start_index = st
327
+ @end_index = en
328
+ @attributes = atts
329
+ @source = source
330
+
331
+ recompute_attributes if @source.is_a?(String)
332
+ @source.obj = self if @source.is_a?(Podoff::Stream)
122
333
  end
123
334
 
124
- def <<(l)
335
+ def dup(new_doc)
125
336
 
126
- @lines << l
337
+ self.class.new(new_doc, ref, start_index, end_index, attributes.dup)
127
338
  end
128
339
 
129
- def lookup(k)
340
+ def self.create(doc, ref, source)
130
341
 
131
- @lines.each do |l|
342
+ self.new(doc, ref, nil, nil, nil, source)
343
+ end
132
344
 
133
- m = l.match(/^\/#{k} (.*)$/)
134
- return m[1] if m
135
- end
345
+ def replicate
136
346
 
137
- nil
347
+ self.class.create(document, ref, source.dup)
138
348
  end
139
349
 
140
- def index(o, start=0)
350
+ def to_a
141
351
 
142
- @lines[start..-1].each_with_index do |l, i|
352
+ [ @ref, @start_index, @end_index, @attributes ]
353
+ end
143
354
 
144
- if o.is_a?(String)
145
- return start + i if l == o
146
- else
147
- return start + i if l.match(o)
148
- end
149
- end
355
+ def source
150
356
 
151
- nil
357
+ @source || @document.source[@start_index..@end_index]
358
+ end
359
+
360
+ def replica?
361
+
362
+ @source != nil
152
363
  end
153
364
 
154
365
  def type
155
366
 
156
- t = lookup('Type')
157
- t ? t[1..-1] : nil
367
+ @attributes && @attributes[:type]
158
368
  end
159
369
 
160
370
  def page_number
161
371
 
162
- r = lookup('pdftk_PageNum')
372
+ r = @attributes && @attributes[:pagenum]
163
373
  r ? r.to_i : nil
164
374
  end
165
375
 
166
- def is_page?
376
+ # def parent
377
+ #
378
+ # r = @attributes[:parent]
379
+ # r ? r[0..-2].strip : nil
380
+ # end
381
+ #
382
+ # def kids
383
+ #
384
+ # r = @attributes[:kids]
385
+ # (r || '').split(/[\[\]R]/).collect(&:strip).reject(&:empty?)
386
+ # end
387
+ #
388
+ # def contents
389
+ #
390
+ # r = @attributes[:contents]
391
+ # (r || '').split(/[\[\]R]/).collect(&:strip).reject(&:empty?)
392
+ # end
167
393
 
168
- page_number != nil
169
- end
394
+ # def add_annotation(ref)
395
+ #
396
+ # if annots = @attributes[:annots]
397
+ # fail "implement me!"
398
+ # else
399
+ # i = @source.index('/Type ')
400
+ # @source.insert(i, "/Annots [#{ref} R]\n")
401
+ # end
402
+ # recompute_attributes
403
+ # end
404
+
405
+ # def add_free_text(x, y, text, font, size)
406
+ #
407
+ # fail ArgumentError.new('target is not a page') unless type == '/Page'
408
+ #
409
+ # nref = document.new_ref
410
+ #
411
+ # s = [
412
+ # "#{nref} obj <<",
413
+ # "/Type /Annot",
414
+ # "/Subtype /FreeText",
415
+ # "/Da (/F1 70 Tf 0 100 Td)",
416
+ # "/Rect [0 0 500 600]",
417
+ # "/Contents (#{text})",
418
+ # ">>",
419
+ # "endobj"
420
+ # ].join("\n")
421
+ # anno = Obj.create(document, nref, s)
422
+ #
423
+ # page = self.replicate
424
+ # page.add_annotation(nref)
425
+ #
426
+ # document.add(anno)
427
+ # document.add(page)
428
+ #
429
+ # anno
430
+ # end
170
431
 
171
- def is_font?
432
+ def insert_font(nick, obj_or_ref)
172
433
 
173
- type() == 'Font'
434
+ fail ArgumentError.new("target '#{ref}' not a replica") \
435
+ unless @source
436
+
437
+ nick = nick[1..-1] if nick[0] == '/'
438
+
439
+ re = obj_or_ref
440
+ re = re.ref if re.respond_to?(:ref)
441
+
442
+ @source = @source.gsub(/\/Font\s*<</, "/Font\n<<\n/#{nick} #{re} R")
174
443
  end
175
444
 
176
- def parent
445
+ def insert_contents(obj_or_ref)
177
446
 
178
- # /Parent 2 0 R
447
+ fail ArgumentError.new("target '#{ref}' not a replica") \
448
+ unless @source
449
+ fail ArgumentError.new("target '#{ref}' doesn't have /Contents") \
450
+ unless @attributes[:contents]
179
451
 
180
- r = lookup('Parent')
452
+ re = obj_or_ref
453
+ re = re.obj if re.respond_to?(:obj) # Stream
454
+ re = re.ref if re.respond_to?(:ref)
181
455
 
182
- r ? r[0..-2].strip : nil
456
+ add_to_attribute(:contents, re)
183
457
  end
458
+ alias :insert_content :insert_contents
184
459
 
185
- def kids
460
+ protected
186
461
 
187
- # /Kids [1 0 R 16 0 R 33 0 R]
462
+ def recompute_attributes
188
463
 
189
- r = lookup('Kids')
190
- (r || '').split(/[\[\]R]/).collect(&:strip).reject(&:empty?)
464
+ @attributes =
465
+ OBJ_ATTRIBUTES.inject({}) do |h, (k, v)|
466
+ m = @source.match(/\/#{v} (\/?[^\/\n<>]+)/)
467
+ h[k] = m[1] if m
468
+ h
469
+ end
191
470
  end
192
471
 
193
- def contents
472
+ def concat(refs, ref)
194
473
 
195
- r = lookup('Contents')
196
- r ? r[0..-2].strip : nil
474
+ refs = refs.strip
475
+ refs = refs[1..-2] if refs[0] == '['
476
+
477
+ "[#{refs} #{ref} R]"
197
478
  end
198
479
 
199
- def font_names
480
+ def add_to_attribute(key, ref)
200
481
 
201
- @lines.inject(nil) do |names, l|
482
+ fail ArgumentError.new("obj not replicated") unless @source
202
483
 
203
- if names
204
- return names if l == '>>'
205
- if m = l.match(/\/([^ ]+) /); names << m[1]; end
206
- elsif l.match(/\/Font\s*$/)
207
- names = []
208
- end
484
+ pkey = OBJ_ATTRIBUTES[key]
209
485
 
210
- names
486
+ if v = @attributes[key]
487
+ v = concat(v, ref)
488
+ @source = @source.gsub(/#{pkey} ([\[\]0-9 R]+)/, "#{pkey} #{v}")
489
+ else
490
+ i = @source.index('/Type ')
491
+ @source.insert(i, "/#{pkey} [#{ref} R]\n")
211
492
  end
212
-
213
- []
493
+ recompute_attributes
214
494
  end
495
+ end
215
496
 
216
- def dup(new_doc)
497
+ class Stream
498
+
499
+ attr_accessor :obj
217
500
 
218
- o0 = self
219
- o = o0.class.new(new_doc, @ref)
220
- o.instance_eval { @lines = o0.lines.dup }
501
+ def initialize
221
502
 
222
- o
503
+ @font = nil
504
+ @content = StringIO.new
223
505
  end
224
506
 
225
- def find(opts={}, &block)
507
+ #def document; obj.document; end
508
+ #def ref; obj.ref; end
509
+ #def source; self; end
226
510
 
227
- return self if block.call(self)
511
+ def tf(font_name, font_size)
228
512
 
229
- [ *kids, contents ].compact.each do |k|
230
- o = @document.objs[k]
231
- return o if o && block.call(o)
232
- end
513
+ n = font_name[0] == '/' ? font_name[1..-1] : font_name
233
514
 
234
- nil
515
+ @font = "/#{n} #{font_size} Tf "
235
516
  end
517
+ alias :font :tf
236
518
 
237
- def crop_box
238
-
239
- r = lookup('CropBox') || lookup('MediaBox')
519
+ def bt(x, y, text)
240
520
 
241
- r ? r.strip[1..-2].split(' ').collect(&:strip).collect(&:to_f) : nil
521
+ @content.write "\n" if @content.size > 0
522
+ @content.write "BT "
523
+ @content.write @font if @font
524
+ @content.write "#{x} #{y} Td (#{escape(text)}) Tj"
525
+ @content.write " ET"
242
526
  end
527
+ alias :text :bt
243
528
 
244
- def crop_dims
529
+ def write(text)
245
530
 
246
- x, y, w, h = crop_box
247
-
248
- x ? [ w - x, h - y ] : nil
531
+ @content.write(text)
249
532
  end
250
533
 
251
- def prepend_text(x, y, text, opts={})
252
-
253
- o = find { |o| o.index('BT') }
254
- fail ArgumentError.new('found no BT in the tree') unless o
534
+ def to_s
255
535
 
256
- font = opts[:font] || o.font_names.first || 'TT0'
257
- size = opts[:size] || 10
258
- comm = opts[:comment]
536
+ @content.string
537
+ end
259
538
 
260
- i = o.index('BT')
261
- bt = []
262
- bt << 'BT'
263
- bt << "#{x} #{y} Td"
264
- bt << "/#{font} #{size} Tf"
265
- bt << "(#{text})Tj"
266
- bt << 'ET'
267
- bt << " % #{comm}" if comm
268
- bt = bt.join(' ')
539
+ protected
269
540
 
270
- o.lines.insert(i, bt)
541
+ def escape(s)
271
542
 
272
- o
543
+ s.gsub(/\(/, '\(').gsub(/\)/, '\)')
273
544
  end
274
545
  end
275
546
  end