ruby-xz 0.2.2 → 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/lib/xz/stream.rb CHANGED
@@ -1,10 +1,10 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #--
3
- # (The MIT license)
4
- #
5
3
  # Basic liblzma-bindings for Ruby.
6
4
  #
7
- # Copyright © 2012, 2015 Marvin Gülker
5
+ # Copyright © 2011-2018 Marvin Gülker et al.
6
+ #
7
+ # See AUTHORS for the full list of contributors.
8
8
  #
9
9
  # Permission is hereby granted, free of charge, to any person obtaining a
10
10
  # copy of this software and associated documentation files (the ‘Software’),
@@ -25,44 +25,441 @@
25
25
  # THE SOFTWARE.
26
26
  #++
27
27
 
28
- # The base class for XZ::StreamReader and XZ::StreamWriter.
29
- # This is an abstract class that is not meant to be used
30
- # directly; if you try, you will soon recognise that you’ve
31
- # created a quite limited object ;-). You can, however, test
32
- # against this class in <tt>kind_of?</tt> tests.
28
+ # The base class for XZ::StreamReader and XZ::StreamWriter. This is
29
+ # an abstract class that is not meant to be used directly. You can,
30
+ # however, test against this class in <tt>kind_of?</tt> tests.
31
+ #
32
+ # XZ::StreamReader and XZ::StreamWriter are IO-like classes that allow
33
+ # you to access XZ-compressed data the same way you access an
34
+ # IO-object, easily allowing you to fool other libraries that expect IO
35
+ # objects. The most noticeable example for this may be reading and
36
+ # writing XZ-compressed tarballs using the minitar
37
+ # RubyGem; see the README.md file for an example.
38
+ #
39
+ # Most of IO's methods are implemented in this class or one of the
40
+ # subclasses. The most notable exception is that it is not possible
41
+ # to seek in XZ archives (#seek and #pos= are not defined).
42
+ # Many methods that are not expressly documented in the RDoc
43
+ # still exist; this class uses Ruby's Forwardable module to forward
44
+ # them to the underlying IO object.
33
45
  #
34
- # XZ::StreamReader and XZ::StreamWriter are IO-like classes that
35
- # allow you to access XZ-compressed data the same way you access
36
- # an IO-object, easily allowing to fool other libraries that expect
37
- # IO objects. The most noticable example for this may be reading
38
- # and writing XZ-compressed tarballs; see XZ::StreamReader and
39
- # XZ::StreamWriter for respective examples.
46
+ # Stream and its subclasses honour Ruby's external+internal encoding
47
+ # system just like Ruby's own IO does. All of what the Ruby docs say
48
+ # about external and internal encodings applies to this class with one
49
+ # important difference. The "external encoding" does not refer to the
50
+ # encoding of the file on the hard disk (this file is always a binary
51
+ # file as it's compressed data), but to the encoding of the
52
+ # decompressed data inside the compressed file.
40
53
  #
41
- # Neither this class nor its subclasses document the IO-methods
42
- # they contain--this is due to the reason that they include the
43
- # great IO::Like module that provides all the necessary IO methods
44
- # based on a few methods you define. For all defined IO methods,
45
- # see the +io-like+ gem’s documentation.
54
+ # As with Ruby's IO class, instances of this class and its subclasses
55
+ # default their external encoding to Encoding.default_external and
56
+ # their internal encoding to Encoding.default_internal. You can use
57
+ # #set_encoding or pass appropriate arguments to the +new+ method to
58
+ # change these encodings per-instance.
46
59
  class XZ::Stream
47
- include IO::Like
60
+ extend Forwardable
61
+
62
+ def_delegator :@delegate_io, :"autoclose="
63
+ def_delegator :@delegate_io, :"autoclose?"
64
+ def_delegator :@delegate_io, :binmode
65
+ def_delegator :@delegate_io, :"binmode?"
66
+ def_delegator :@delegate_io, :"close_on_exec="
67
+ def_delegator :@delegate_io, :"close_on_exec?"
68
+ def_delegator :@delegate_io, :fcntl
69
+ def_delegator :@delegate_io, :fdatasync
70
+ def_delegator :@delegate_io, :fileno
71
+ def_delegator :@delegate_io, :to_i
72
+ def_delegator :@delegate_io, :flush # TODO: liblzma might have its own flush method that should be used
73
+ def_delegator :@delegate_io, :fsync
74
+ def_delegator :@delegate_io, :ioctl
75
+ def_delegator :@delegate_io, :isatty
76
+ def_delegator :@delegate_io, :pid
77
+ #def_delegator :@delegate_io, :stat # If this is available the minitar gem thinks it's a File and wants to seek it O_o
78
+ def_delegator :@delegate_io, :sync # TODO: use liblzma's own syncing functionality?
79
+ def_delegator :@delegate_io, :"sync=" # TODO: use liblzma's own syncing functionality?
80
+ def_delegator :@delegate_io, :"tty?"
81
+
82
+ # Like IO#lineno and IO#lineno=.
83
+ attr_accessor :lineno
84
+
85
+ # Returns the encoding used inside the compressed data stream.
86
+ # Like IO#external_encoding.
87
+ attr_reader :external_encoding
88
+
89
+ # When compressed data is read, the decompressed data is transcoded
90
+ # from the external_encoding to this encoding. If this encoding is
91
+ # nil, no transcoding happens.
92
+ attr_reader :internal_encoding
93
+
94
+ # Private API only for use by subclasses.
95
+ def initialize(delegate_io) # :nodoc:
96
+ @delegate_io = delegate_io
97
+ @lzma_stream = XZ::LibLZMA::LZMAStream.malloc
98
+ XZ::LibLZMA::LZMA_STREAM_INIT(@lzma_stream)
99
+
100
+ @finished = false
101
+ @lineno = 0
102
+ @pos = 0
103
+ @external_encoding = Encoding.default_external
104
+ @internal_encoding = Encoding.default_internal
105
+ @transcode_options = {}
106
+ @input_buffer_p = Fiddle::Pointer.malloc(XZ::CHUNK_SIZE)
107
+ @output_buffer_p = Fiddle::Pointer.malloc(XZ::CHUNK_SIZE)
108
+ end
109
+
110
+ # Pass the given +str+ into liblzma's lzma_code() function.
111
+ # +action+ is either LibLZMA::LZMA_RUN (still working) or
112
+ # LibLZMA::LZMA_FINISH (this is the last piece).
113
+ def lzma_code(str, action) # :nodoc:
114
+ previous_encoding = str.encoding
115
+ str.force_encoding(Encoding::BINARY) # Need to operate on bytes now
116
+
117
+ begin
118
+ pos = 0
119
+ until pos > str.bytesize # Do not use >=, that conflicts with #lzma_finish
120
+ substr = str[pos, XZ::CHUNK_SIZE]
121
+ @input_buffer_p[0, substr.bytesize] = substr
122
+ pos += XZ::CHUNK_SIZE
123
+
124
+ @lzma_stream.next_in = @input_buffer_p
125
+ @lzma_stream.avail_in = substr.bytesize
126
+
127
+ loop do
128
+ @lzma_stream.next_out = @output_buffer_p
129
+ @lzma_stream.avail_out = XZ::CHUNK_SIZE
130
+ res = XZ::LibLZMA.lzma_code(@lzma_stream.to_ptr, action)
131
+ XZ.send :check_lzma_code_retval, res # call package-private method
132
+
133
+ data = @output_buffer_p[0, XZ::CHUNK_SIZE - @lzma_stream.avail_out]
134
+ yield(data)
135
+
136
+ break unless @lzma_stream.avail_out == 0
137
+ end
138
+ end
139
+ ensure
140
+ str.force_encoding(previous_encoding)
141
+ end
142
+ end
143
+
144
+ # Partial implementation of +rewind+ abstracting common operations.
145
+ # The subclasses implement the rest.
146
+ def rewind # :nodoc:
147
+ # Free the current lzma stream and rewind the underlying IO.
148
+ # It is required to call #rewind before allocating a new lzma
149
+ # stream, because if #rewind raises an exception (because the
150
+ # underlying IO is not rewindable), a memory leak would occur
151
+ # with regard to an allocated-but-never-freed lzma stream.
152
+ finish
153
+ @delegate_io.rewind
154
+
155
+ # Reset internal state
156
+ @pos = @lineno = 0
157
+ @finished = false
158
+
159
+ # Allocate a new lzma stream (subclasses will configure it).
160
+ @lzma_stream = XZ::LibLZMA::LZMAStream.malloc
161
+ XZ::LibLZMA::LZMA_STREAM_INIT(@lzma_stream)
162
+
163
+ 0 # Mimic IO#rewind's return value
164
+ end
165
+
166
+ # You can mostly treat this as if it were an IO object.
167
+ # At least for subclasses. This class itself is abstract,
168
+ # you shouldn't be using it directly at all.
169
+ #
170
+ # Returns the receiver.
171
+ def to_io
172
+ self
173
+ end
174
+
175
+ # Overridden in StreamReader to be like IO#eof?.
176
+ # This abstract implementation only raises IOError.
177
+ def eof?
178
+ raise(IOError, "Stream not opened for reading")
179
+ end
180
+
181
+ # Alias for #eof?
182
+ def eof
183
+ eof?
184
+ end
185
+
186
+ # True if the delegate IO has been closed.
187
+ def closed?
188
+ @delegate_io.closed?
189
+ end
190
+
191
+ # True if liblzma's internal memory has been freed. For writer
192
+ # instances, receiving true from this method also means that all
193
+ # of liblzma's compressed data has been flushed to the underlying
194
+ # IO object.
195
+ def finished?
196
+ @finished
197
+ end
198
+
199
+ # Free internal liblzma memory. This needs to be called before
200
+ # you leave this object for the GC. If you used a block-form
201
+ # initializer, this is done automatically for you.
202
+ #
203
+ # Subsequent calls to #read or #write will cause an IOError.
204
+ #
205
+ # Returns the underlying IO object. This allows you to retrieve
206
+ # the File instance that was automatically created when using
207
+ # the +open+ method's block form.
208
+ def finish
209
+ return if @finished
210
+
211
+ # Clean up the lzma_stream structure's internal memory.
212
+ # This would belong into a destructor if Ruby had that.
213
+ XZ::LibLZMA.lzma_end(@lzma_stream)
214
+ @finished = true
215
+
216
+ @delegate_io
217
+ end
218
+
219
+
220
+ # If not done yet, call #finish. Then close the delegate IO.
221
+ # The latter action is going to cause the delegate IO to
222
+ # flush its buffer. After this method returns, it is guaranteed
223
+ # that all pending data has been flushed to the OS' kernel.
224
+ def close
225
+ finish unless @finished
226
+ @delegate_io.close unless @delegate_io.closed?
227
+ nil
228
+ end
229
+
230
+ # Always raises IOError, because XZ streams can never be duplex.
231
+ def close_read
232
+ raise(IOError, "Not a duplex I/O stream")
233
+ end
234
+
235
+ # Always raises IOError, because XZ streams can never be duplex.
236
+ def close_write
237
+ raise(IOError, "Not a duplex I/O stream")
238
+ end
239
+
240
+ # Overridden in StreamReader to be like IO#read.
241
+ # This abstract implementation only raises IOError.
242
+ def read(*args)
243
+ raise(IOError, "Stream not opened for reading")
244
+ end
245
+
246
+ # Overridden in StreamWriter to be like IO#write.
247
+ # This abstract implementation only raises IOError.
248
+ def write(*args)
249
+ raise(IOError, "Stream not opened for writing")
250
+ end
251
+
252
+ # Returns the position in the *decompressed* data (regardless of
253
+ # whether this is a reader or a writer instance).
254
+ def pos
255
+ @pos
256
+ end
257
+ alias tell pos
258
+
259
+ # Like IO#set_encoding.
260
+ def set_encoding(*args)
261
+ if args.count < 1 || args.count > 3
262
+ raise ArgumentError, "Wrong number of arguments: Expected 1-3, got #{args.count}"
263
+ end
264
+
265
+ # Clean `args' to [external_encoding, internal_encoding],
266
+ # and @transcode_options.
267
+ return set_encoding($`, $', *args[1..-1]) if args[0].respond_to?(:to_str) && args[0].to_str =~ /:/
268
+ @transcode_options = args.delete_at(-1) if args[-1].kind_of?(Hash)
269
+
270
+ # `args' is always [external, internal] or [external] at this point
271
+ @external_encoding = args[0].kind_of?(Encoding) ? args[0] : Encoding.find(args[0])
272
+ if args[1]
273
+ @internal_encoding = args[1].kind_of?(Encoding) ? args[1] : Encoding.find(args[1])
274
+ else
275
+ @internal_encoding = Encoding.default_internal # Encoding.default_internal defaults to nil
276
+ end
277
+
278
+ self
279
+ end
280
+
281
+ # Do not define #pos= and #seek, not even to throw NotImplementedError.
282
+ # Reason: The minitar gem thinks it can use these methods then and provokes
283
+ # the NotImplementedError exception.
284
+
285
+ # Like IO#<<.
286
+ def <<(obj)
287
+ write(obj.to_s)
288
+ end
289
+
290
+ # Like IO#advise. No-op, because not meaningful on compressed data.
291
+ def advise
292
+ nil
293
+ end
294
+
295
+ # Like IO#getbyte. Note this method isn't exactly performant,
296
+ # because it actually reads compressed data as a string and then
297
+ # needs to figure out the bytes from that again.
298
+ def getbyte
299
+ return nil if eof?
300
+ read(1).bytes.first
301
+ end
302
+
303
+ # Like IO#readbyte.
304
+ def readbyte
305
+ getbyte || raise(EOFError, "End of stream reached")
306
+ end
307
+
308
+ # Like IO#getc.
309
+ def getc
310
+ str = String.new
311
+
312
+ # Read byte-by-byte until a valid character in the external
313
+ # encoding was built.
314
+ loop do
315
+ str.force_encoding(Encoding::BINARY)
316
+ str << read(1)
317
+ str.force_encoding(@external_encoding)
318
+
319
+ break if str.valid_encoding? || eof?
320
+ end
321
+
322
+ # Transcode to internal encoding if one was requested
323
+ if @internal_encoding
324
+ str.encode(@internal_encoding)
325
+ else
326
+ str
327
+ end
328
+ end
329
+
330
+ # Like IO#readchar.
331
+ def readchar
332
+ getc || raise(EOFError, "End of stream reached")
333
+ end
334
+
335
+ # Like IO#gets.
336
+ def gets(separator = $/, limit = nil)
337
+ return nil if eof?
338
+ @lineno += 1
339
+
340
+ # Mirror IO#gets' weird call-seq
341
+ if separator.respond_to?(:to_int)
342
+ limit = separator.to_int
343
+ separator = $/
344
+ end
345
+
346
+ buf = String.new
347
+ buf.force_encoding(target_encoding)
348
+ until eof? || (limit && buf.length >= limit)
349
+ buf << getc
350
+ return buf if buf[-1] == separator
351
+ end
352
+
353
+ buf
354
+ end
355
+
356
+ # Like IO#readline.
357
+ def readline(*args)
358
+ gets(*args) || raise(EOFError, "End of stream reached")
359
+ end
360
+
361
+ # Like IO#each.
362
+ def each(*args)
363
+ return enum_for __method__ unless block_given?
364
+
365
+ while line = gets(*args)
366
+ yield(line)
367
+ end
368
+ end
369
+ alias each_line each
370
+
371
+ # Like IO#each_byte.
372
+ def each_byte
373
+ return enum_for __method__ unless block_given?
374
+
375
+ while byte = getbyte
376
+ yield(byte)
377
+ end
378
+ end
379
+
380
+ # Like IO#each_char.
381
+ def each_char
382
+ return enum_for __method__ unless block_given?
383
+
384
+ while char = getc
385
+ yield(char)
386
+ end
387
+ end
388
+
389
+ # Like IO#each_codepoint.
390
+ def each_codepoint
391
+ return enum_for __method__ unless block_given?
392
+
393
+ each_char{|c| yield(c.ord)}
394
+ end
395
+
396
+ # Like IO#printf.
397
+ def printf(*args)
398
+ write(sprintf(*args))
399
+ nil
400
+ end
401
+
402
+ # Like IO#putc.
403
+ def putc(obj)
404
+ if obj.respond_to? :chr
405
+ write(obj.chr)
406
+ elsif obj.respond_to? :to_str
407
+ write(obj.to_str)
408
+ else
409
+ raise(TypeError, "Can only #putc strings and numbers")
410
+ end
411
+ end
412
+
413
+ def puts(*objs)
414
+ if objs.empty?
415
+ write("\n")
416
+ return nil
417
+ end
418
+
419
+ objs.each do |obj|
420
+ if obj.respond_to? :to_ary
421
+ puts(*obj.to_ary)
422
+ else
423
+ # Don't squeeze multiple subsequent trailing newlines in `obj'
424
+ obj = obj.to_s
425
+ if obj.end_with?("\n".encode(obj.encoding))
426
+ write(obj)
427
+ else
428
+ write(obj + "\n".encode(obj.encoding))
429
+ end
430
+ end
431
+ end
432
+ nil
433
+ end
434
+
435
+ # Like IO#print.
436
+ def print(*objs)
437
+ if objs.empty?
438
+ write($_)
439
+ else
440
+ objs.each do |obj|
441
+ write(obj.to_s)
442
+ write($,) if $,
443
+ end
444
+ end
445
+
446
+ write($\) if $\
447
+ nil
448
+ end
48
449
 
49
- # Creates a new instance of this class. Don’t use this directly,
50
- # it’s only called by subclasses’ ::new methods.
51
- def initialize(delegate_io)
52
- @delegate_io = delegate_io
53
- @lzma_stream = XZ::LZMAStream.new
450
+ # It is not possible to reopen an lzma stream, hence this
451
+ # method always raises NotImplementedError.
452
+ def reopen(*args)
453
+ raise(NotImplementedError, "Can't reopen an lzma stream")
54
454
  end
55
455
 
56
456
  private
57
457
 
58
- # This method returns the size of +str+ in bytes.
59
- def binary_size(str)
60
- # Believe it or not, but this is faster than str.bytes.to_a.size.
61
- # I benchmarked it, and it is as twice as fast.
62
- if str.respond_to? :force_encoding
63
- str.dup.force_encoding(Encoding::BINARY).size
458
+ def target_encoding
459
+ if @internal_encoding
460
+ @internal_encoding
64
461
  else
65
- str.bytes.to_a.size
462
+ @external_encoding
66
463
  end
67
464
  end
68
465