zim-ruby 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ # About
2
+
3
+ zim-ruby is a Ruby library for reading openZIM (http://openzim.org) files.
@@ -0,0 +1,20 @@
1
#!/usr/bin/env ruby
# Unpacks the entries of a ZIM file into a directory, creating one
# sub-directory per namespace.
#
# Usage: unpack.rb <zimfile> <directory>

$LOAD_PATH << '../lib'

require 'fileutils'
require 'zim'

if ARGV.size != 2
  puts "unpack.rb <zimfile> <directory>"
  exit 1
end

out = ARGV[1]

f = Zim::ZimFile.new(ARGV[0])
f.urls.each do |url|
  puts url
  # mkdir_p is a no-op for an existing directory and, unlike Dir.mkdir,
  # also creates the output directory itself when it is still missing.
  FileUtils.mkdir_p("#{out}/#{url.namespace}")
  # The block form closes (and thereby flushes) the handle after every
  # entry instead of leaking one descriptor per URL; binary mode keeps the
  # blob bytes untouched on platforms that translate line endings.
  File.open("#{out}#{url.to_s}", 'wb') { |file| file.write(url.blob) }
end
20
+
@@ -0,0 +1,462 @@
1
+ #Encoding: UTF-8
2
+ =begin (The MIT License)
3
+
4
+ Basic liblzma-bindings for Ruby.
5
+
6
+ Copyright © 2011 Marvin Gülker
7
+
8
+ Permission is hereby granted, free of charge, to any person obtaining a
9
+ copy of this software and associated documentation files (the ‘Software’),
10
+ to deal in the Software without restriction, including without limitation
11
+ the rights to use, copy, modify, merge, publish, distribute, sublicense,
12
+ and/or sell copies of the Software, and to permit persons to whom the Software
13
+ is furnished to do so, subject to the following conditions:
14
+
15
+ The above copyright notice and this permission notice shall be included in all
16
+ copies or substantial portions of the Software.
17
+
18
+ THE SOFTWARE IS PROVIDED ‘AS IS’, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24
+ THE SOFTWARE.
25
+ =end
26
+
27
+ require "ffi"
28
+ require 'stringio'
29
+
30
+ #The namespace and main module of this library. Each method of this module
31
+ #may raise exceptions of class XZ::LZMAError, which is not named in the
32
+ #methods' documentations anymore.
33
+ module XZ
34
+
35
+ #This module wraps functions and enums used by liblzma.
36
module LibLZMA
  extend FFI::Library

  #The maximum value of an uint64_t, as defined by liblzma.
  #Should be the same as
  #  (2 ** 64) - 1
  UINT64_MAX = 18446744073709551615

  #Activates extreme compression. Same as xz's "-e" commandline switch.
  LZMA_PRESET_EXTREME = 1 << 31

  #Decoder flags. They are OR-ed together and passed as the last argument
  #of lzma_stream_decoder(). The values have to match the ones in
  #liblzma's headers, where LZMA_TELL_NO_CHECK is 0x01; with the former
  #value of 0x02 it was indistinguishable from LZMA_TELL_UNSUPPORTED_CHECK.
  LZMA_TELL_NO_CHECK = 0x01
  LZMA_TELL_UNSUPPORTED_CHECK = 0x02
  LZMA_TELL_ANY_CHECK = 0x04
  LZMA_CONCATENATED = 0x08

  #Placeholder enum used by liblzma for later additions.
  LZMA_RESERVED_ENUM = enum :lzma_reserved_enum, 0

  #Actions that can be passed to the lzma_code() function.
  LZMA_ACTION = enum :lzma_run, 0,
                     :lzma_sync_flush,
                     :lzma_full_flush,
                     :lzma_finish

  #Integrity check algorithms supported by liblzma.
  LZMA_CHECK = enum :lzma_check_none, 0,
                    :lzma_check_crc32, 1,
                    :lzma_check_crc64, 4,
                    :lzma_check_sha256, 10

  #Possible return values of liblzma functions.
  LZMA_RET = enum :lzma_ok, 0,
                  :lzma_stream_end,
                  :lzma_no_check,
                  :lzma_unsupported_check,
                  :lzma_get_check,
                  :lzma_mem_error,
                  :lzma_memlimit_error,
                  :lzma_format_error,
                  :lzma_options_error,
                  :lzma_data_error,
                  :lzma_buf_error,
                  :lzma_prog_error

  #Library names are tried in the given order.
  ffi_lib ['lzma.so.2', 'lzma.so', 'lzma']

  #All functions are attached with plain :int return types, so callers
  #receive integer codes rather than LZMA_RET symbols.
  attach_function :lzma_easy_encoder, [:pointer, :uint32, :int], :int
  attach_function :lzma_code, [:pointer, :int], :int
  attach_function :lzma_stream_decoder, [:pointer, :uint64, :uint32], :int
  attach_function :lzma_end, [:pointer], :void

end
89
+
90
+ #The class of the error that this library raises.
91
class LZMAError < StandardError

  #Raises an appropriate exception if +val+ isn't a liblzma success code.
  #===Parameters
  #[val] A liblzma return code, given either as a symbol of the
  #      LibLZMA::LZMA_RET enum (e.g. <tt>:lzma_data_error</tt>) or as
  #      the plain integer handed back by the attached liblzma functions.
  #===Return value
  #+nil+ if +val+ doesn't denote an error.
  def self.raise_if_necessary(val)
    #The liblzma functions are attached with an :int return type, so the
    #codes arrive here as integers. Map them to their enum symbol first;
    #comparing an integer against the symbols below would never match
    #and every error would pass silently.
    code = val.is_a?(Integer) ? LibLZMA::LZMA_RET[val] : val

    case code
    when :lzma_mem_error      then raise(self, "Couldn't allocate memory!")
    when :lzma_memlimit_error then raise(self, "Decoder ran out of (allowed) memory!")
    when :lzma_format_error   then raise(self, "Unrecognized file format!")
    when :lzma_options_error  then raise(self, "Invalid options passed!")
    when :lzma_data_error     then raise(self, "Archive is corrupt.")
    when :lzma_buf_error      then raise(self, "Buffer unusable!")
    when :lzma_prog_error     then raise(self, "Program error--if you're sure your code is correct, you may have found a bug in liblzma.")
    end
  end

end
107
+
108
+ #The main struct of the liblzma library.
109
class LZMAStream < FFI::Struct
  #Field order and types follow liblzma's lzma_stream structure.
  layout :next_in, :pointer, #uint8
         :avail_in, :size_t,
         :total_in, :uint64,
         :next_out, :pointer, #uint8
         :avail_out, :size_t,
         :total_out, :uint64,
         :lzma_allocator, :pointer,
         :lzma_internal, :pointer,
         :reserved_ptr1, :pointer,
         :reserved_ptr2, :pointer,
         :reserved_ptr3, :pointer,
         :reserved_ptr4, :pointer,
         :reserved_int1, :uint64,
         :reserved_int2, :uint64,
         :reserved_int3, :size_t,
         :reserved_int4, :size_t,
         :reserved_enum1, :int,
         :reserved_enum2, :int

  #This method does basically the same thing as the
  #LZMA_STREAM_INIT macro of liblzma. Creates a new LZMAStream
  #that has been initialized for usage. If any argument is passed,
  #it is assumed to be a FFI::Pointer to a lzma_stream structure
  #and that structure is wrapped.
  def initialize(*args)
    if args.empty?
      #No pointer given: allocate a fresh structure and reset every
      #field. The fields are set on +self+ rather than on the return
      #value of +super+, and the input pointer field is :next_in (the
      #layout has no :next field).
      super()
      self[:next_in] = nil
      self[:avail_in] = 0
      self[:total_in] = 0
      self[:next_out] = nil
      self[:avail_out] = 0
      self[:total_out] = 0
      self[:lzma_allocator] = nil
      self[:lzma_internal] = nil
      self[:reserved_ptr1] = nil
      self[:reserved_ptr2] = nil
      self[:reserved_ptr3] = nil
      self[:reserved_ptr4] = nil
      self[:reserved_int1] = 0
      self[:reserved_int2] = 0
      self[:reserved_int3] = 0
      self[:reserved_int4] = 0
      self[:reserved_enum1] = LibLZMA::LZMA_RESERVED_ENUM[:lzma_reserved_enum]
      self[:reserved_enum2] = LibLZMA::LZMA_RESERVED_ENUM[:lzma_reserved_enum]
    else
      #Got a pointer, want to wrap it. The bare +super+ forwards +args+
      #and the existing structure is left untouched.
      super
    end
  end
end
161
+
162
+ #Number of bytes read in one chunk.
163
+ CHUNK_SIZE = 4096
164
+ #The version of this library.
165
+ VERSION = "0.0.2".freeze
166
+
167
+ class << self
168
+
169
+ #call-seq:
170
+ # decompress_stream(io [, memory_limit [, flags ] ] ) → a_string
171
+ # decompress_stream(io [, memory_limit [, flags ] ] ){|chunk| ... } → an_integer
172
+ # decode_stream(io [, memory_limit [, flags ] ] ) → a_string
173
+ # decode_stream(io [, memory_limit [, flags ] ] ){|chunk| ... } → an_integer
174
+ #
175
+ #Decompresses a stream containing XZ-compressed data.
176
+ #===Parameters
177
+ #[io] The IO to read from. It must be opened for reading.
178
+ #[memory_limit] (+UINT64_MAX+) If not XZ::LibLZMA::UINT64_MAX, makes liblzma
179
+ # use no more memory than +memory_limit+ bytes.
180
+ #[flags] (<tt>[:tell_unsupported_check]</tt>) Additional flags
181
+ # passed to liblzma (an array). Possible flags are:
182
+ # [:tell_no_check] Spit out a warning if the archive hasn't an
183
+ # integrity checksum.
184
+ # [:tell_unsupported_check] Spit out a warning if the archive
185
+ # has an unsupported checksum type.
186
+ # [:concatenated] Decompress concatenated archives.
187
+ #[chunk] (Block argument) One piece of decompressed data.
188
+ #===Return value
189
+ #If a block was given, returns the number of bytes written. Otherwise,
190
+ #returns the decompressed data as a BINARY-encoded string.
191
+ #===Example
192
+ # data = File.open("archive.xz", "rb"){|f| f.read}
193
+ # io = StringIO.new(data)
194
+ # XZ.decompress_stream(io) #=> "I AM THE DATA"
195
+ # io.rewind
196
+ # str = ""
197
+ # XZ.decompress_stream(io, XZ::LibLZMA::UINT64_MAX, [:tell_no_check]){|c| str << c} #=> 13
198
+ # str #=> "I AM THE DATA"
199
+ #===Remarks
200
+ #The block form is *much* better on memory usage, because it doesn't have
201
+ #to load everything into RAM at once. If you don't know how big your
202
+ #data gets or if you want to decompress much data, use the block form. Of
203
+ #course you shouldn't store the data you read in RAM then as in the
204
+ #example above.
205
def decompress_stream(io, memory_limit = LibLZMA::UINT64_MAX, flags = [:tell_unsupported_check], &block)
  raise(ArgumentError, "Invalid memory limit set!") unless (0..LibLZMA::UINT64_MAX).include?(memory_limit)
  flags.each do |flag|
    raise(ArgumentError, "Unknown flag #{flag}!") unless [:tell_no_check, :tell_unsupported_check, :tell_any_check, :concatenated].include?(flag)
  end

  stream = LZMAStream.new
  begin
    #Each flag symbol maps onto the LibLZMA::LZMA_* constant of the same
    #name; the constants are OR-ed into one bit mask.
    res = LibLZMA.lzma_stream_decoder(
      stream.pointer,
      memory_limit,
      flags.inject(0){|val, flag| val | LibLZMA.const_get(:"LZMA_#{flag.to_s.upcase}")}
    )

    LZMAError.raise_if_necessary(res)

    if block_given?
      lzma_code(io, stream, &block)
      #Read the counter before the stream is torn down in +ensure+.
      stream[:total_out]
    else
      data = ""
      lzma_code(io, stream){|chunk| data << chunk}
      data
    end
  ensure
    #Release liblzma's internal state even if reading from +io+, the
    #caller's block or liblzma itself raised.
    LibLZMA.lzma_end(stream.pointer)
  end
end
alias decode_stream decompress_stream
232
+
233
+ #call-seq:
234
+ # compress_stream(io [, compression_level [, check [, extreme ] ] ] ) → a_string
235
+ # compress_stream(io [, compression_level [, check [, extreme ] ] ] ){|chunk| ... } → an_integer
236
+ # encode_stream(io [, compression_level [, check [, extreme ] ] ] ) → a_string
237
+ # encode_stream(io [, compression_level [, check [, extreme ] ] ] ){|chunk| ... } → an_integer
238
+ #
239
+ #Compresses a stream of data into XZ-compressed data.
240
+ #===Parameters
241
+ #[io] The IO to read the data from. Must be opened for
242
+ # reading.
243
+ #[compression_level] (6) Compression strength. Higher values indicate a
244
+ # smaller result, but longer compression time. Maximum
245
+ # is 9.
246
+ #[check] (:crc64) The checksum algorithm to use for verifying
247
+ # the data inside the archive. Possible values are:
248
+ # * :none
249
+ # * :crc32
250
+ # * :crc64
251
+ # * :sha256
252
+ #[extreme] (false) Tries to get the last bit out of the
253
+ # compression. This may succeed, but you can end
254
+ # up with *very* long computation times.
255
+ #[chunk] (Block argument) One piece of compressed data.
256
+ #===Return value
257
+ #If a block was given, returns the number of bytes written. Otherwise,
258
+ #returns the compressed data as a BINARY-encoded string.
259
+ #===Example
260
+ # data = File.read("file.txt")
261
+ # i = StringIO.new(data)
262
+ # XZ.compress_stream(i) #=> Some binary blob
263
+ # i.rewind
264
+ # str = ""
265
+ # XZ.compress_stream(i, 4, :sha256){|c| str << c} #=> 123
266
+ # str #=> Some binary blob
267
+ #===Remarks
268
+ #The block form is *much* better on memory usage, because it doesn't have
269
+ #to load everything into RAM at once. If you don't know how big your
270
+ #data gets or if you want to compress much data, use the block form. Of
271
+ #course you shouldn't store the data you read in RAM then as in the
272
+ #example above.
273
def compress_stream(io, compression_level = 6, check = :crc64, extreme = false, &block)
  raise(ArgumentError, "Invalid compression level!") unless (0..9).include?(compression_level)
  raise(ArgumentError, "Invalid checksum specified!") unless [:none, :crc32, :crc64, :sha256].include?(check)

  stream = LZMAStream.new
  begin
    #The preset is the compression level, optionally OR-ed with the
    #"extreme" bit.
    res = LibLZMA.lzma_easy_encoder(
      stream.pointer,
      compression_level | (extreme ? LibLZMA::LZMA_PRESET_EXTREME : 0),
      LibLZMA::LZMA_CHECK[:"lzma_check_#{check}"]
    )

    LZMAError.raise_if_necessary(res)

    if block_given?
      lzma_code(io, stream, &block)
      #Read the counter before the stream is torn down in +ensure+.
      stream[:total_out]
    else
      data = ""
      lzma_code(io, stream){|chunk| data << chunk}
      data
    end
  ensure
    #Release liblzma's internal state even if reading from +io+, the
    #caller's block or liblzma itself raised.
    LibLZMA.lzma_end(stream.pointer)
  end
end
alias encode_stream compress_stream
298
+
299
+ #Compresses +in_file+ and writes the result to +out_file+.
300
+ #===Parameters
301
+ #[in_file] The path to the file to read from.
302
+ #[out_file] The path of the file to write to. If it exists, it will be
303
+ # overwritten.
304
+ #For the other parameters, see the compress_stream method.
305
+ #===Return value
306
+ #The number of bytes written, i.e. the size of the archive.
307
+ #===Example
308
+ # XZ.compress_file("myfile.txt", "myfile.txt.xz")
309
+ # XZ.compress_file("myarchive.tar", "myarchive.tar.xz")
310
+ #===Remarks
311
+ #This method is safe to use with big files, because files are not loaded
312
+ #into memory completely at once.
313
def compress_file(in_file, out_file, compression_level = 6, check = :crc64, extreme = false)
  #Both files are opened in binary mode and closed by the block forms;
  #the return value of compress_stream is passed through unchanged.
  File.open(in_file, "rb") do |source|
    File.open(out_file, "wb") do |target|
      compress_stream(source, compression_level, check, extreme) { |piece| target.write(piece) }
    end
  end
end
322
+
323
+ #Compresses arbitrary data using the XZ algorithm.
324
+ #===Parameters
325
+ #[str] The data to compress.
326
+ #For the other parameters, see the compress_stream method.
327
+ #===Return value
328
+ #The compressed data as a BINARY-encoded string.
329
+ #===Example
330
+ # data = "I love Ruby"
331
+ # comp = XZ.compress(data) #=> binary blob
332
+ #===Remarks
333
+ #Don't use this method for big amounts of data--you may run out of
334
+ #memory. Use compress_file or compress_stream instead.
335
def compress(str, compression_level = 6, check = :crc64, extreme = false)
  #The string is wrapped into a StringIO so that the stream-based
  #implementation can be reused.
  unless defined? StringIO
    raise(NotImplementedError, "StringIO isn't available!")
  end

  compress_stream(StringIO.new(str), compression_level, check, extreme)
end
340
+
341
+ #Decompresses data in XZ format.
342
+ #===Parameters
343
+ #[str] The data to decompress.
344
+ #For the other parameters, see the decompress_stream method.
345
+ #===Return value
346
+ #The decompressed data as a BINARY-encoded string.
347
+ #===Example
348
+ # comp = File.open("data.xz", "rb"){|f| f.read}
349
+ # data = XZ.decompress(comp) #=> "I love Ruby"
350
+ #===Remarks
351
+ #Don't use this method for big amounts of data--you may run out of
352
+ #memory. Use decompress_file or decompress_stream instead.
353
def decompress(str, memory_limit = LibLZMA::UINT64_MAX, flags = [:tell_unsupported_check])
  #The string is wrapped into a StringIO so that the stream-based
  #implementation can be reused.
  unless defined? StringIO
    raise(NotImplementedError, "StringIO isn't available!")
  end

  decompress_stream(StringIO.new(str), memory_limit, flags)
end
358
+
359
+ #Decompresses +in_file+ and writes the result to +out_file+.
360
+ #===Parameters
361
+ #[in_file] The path to the file to read from.
362
+ #[out_file] The path of the file to write to. If it exists, it will
363
+ # be overwritten.
364
+ #For the other parameters, see the decompress_stream method.
365
+ #===Return value
366
+ #The number of bytes written, i.e. the size of the uncompressed data.
367
+ #===Example
368
+ # XZ.decompress_file("myfile.txt.xz", "myfile.txt")
369
+ # XZ.decompress_file("myarchive.tar.xz", "myarchive.tar")
370
+ #===Remarks
371
+ #This method is safe to use with big files, because files are not loaded
372
+ #into memory completely at once.
373
def decompress_file(in_file, out_file, memory_limit = LibLZMA::UINT64_MAX, flags = [:tell_unsupported_check])
  #Both files are opened in binary mode and closed by the block forms;
  #the return value of decompress_stream is passed through unchanged.
  File.open(in_file, "rb") do |source|
    File.open(out_file, "wb") do |target|
      decompress_stream(source, memory_limit, flags) { |piece| target.write(piece) }
    end
  end
end
382
+
383
+ private
384
+
385
+ #This method returns the size of +str+ in bytes.
386
def binary_size(str)
  #String#bytesize counts the bytes directly. The former approach
  #duplicated the whole string and re-tagged the copy as BINARY just to
  #ask for its size, which costs one extra copy per processed chunk.
  if str.respond_to? :bytesize
    str.bytesize
  else
    #Fallback for objects without #bytesize.
    str.bytes.to_a.size
  end
end
395
+
396
+ #This method does the heavy work of (de-)compressing a stream. It takes
397
+ #an IO object to read data from (that means the IO must be opened
398
+ #for reading) and a XZ::LZMAStream object that is used to (de-)compress
399
+ #the data. Furthermore this method takes a block which gets passed
400
+ #the (de-)compressed data in chunks one at a time--this is needed to allow
401
+ #(de-)compressing of very large files that can't be loaded fully into
402
+ #memory.
403
def lzma_code(io, stream)
  input_buffer_p = FFI::MemoryPointer.new(CHUNK_SIZE)
  output_buffer_p = FFI::MemoryPointer.new(CHUNK_SIZE)
  stream_end = LibLZMA::LZMA_RET[:lzma_stream_end]

  loop do
    #+read+ returns nil once +io+ is exhausted. Treat that as an empty
    #final chunk instead of stopping right away: otherwise an empty
    #input would never reach the lzma_finish call below and compressing
    #it would yield nothing at all rather than a valid empty archive.
    str = io.read(CHUNK_SIZE)
    last_chunk = str.nil? || io.eof?
    str ||= ""
    input_buffer_p.write_string(str)

    #Set the data for compressing
    stream[:next_in] = input_buffer_p
    stream[:avail_in] = binary_size(str)

    #The action is the same for every pass over this chunk, so it is
    #determined once: finish on the last chunk, run otherwise.
    action = LibLZMA::LZMA_ACTION[last_chunk ? :lzma_finish : :lzma_run]

    #Now loop until we gathered all the data in stream[:next_out]. Depending on the
    #amount of data, this may not fit into the buffer, meaning that we have to
    #provide a pointer to a "new" buffer that liblzma can write into. Since
    #liblzma already set stream[:avail_in] to 0 in the first iteration, the extra call to the
    #lzma_code() function doesn't hurt (indeed the pipe_comp example from
    #liblzma handles it this way too). Sometimes it happens that the compressed data
    #is bigger than the original (notably when the amount of data to compress
    #is small)
    loop do
      #Prepare for getting the (de-)compressed data
      stream[:next_out] = output_buffer_p
      stream[:avail_out] = CHUNK_SIZE

      #(De-)compress the data
      res = LibLZMA.lzma_code(stream.pointer, action)
      check_lzma_code_retval(res)

      #Hand out whatever liblzma wrote into the output buffer
      data = output_buffer_p.read_string(CHUNK_SIZE - stream[:avail_out])
      yield(data)

      #If the buffer is completely filled, it's likely that there is
      #more data liblzma wants to hand to us. Start a new iteration,
      #but don't provide new input data. Once liblzma reports the end
      #of the stream there is nothing left to fetch.
      break if res == stream_end || stream[:avail_out] != 0
    end #loop

    break if last_chunk
  end #loop
end #lzma_code
446
+
447
+ #Checks for errors and warnings that can be derived from the return
448
+ #value of the lzma_code() function and shows them if necessary.
449
def check_lzma_code_retval(code)
  ret_codes = LibLZMA::LZMA_RET

  #The first three codes are not failures: two only produce a warning
  #and one is ignored. Everything else is left to LZMAError.
  if ret_codes[:lzma_no_check] == code
    warn("Couldn't verify archive integrity--archive has not integrity checksum.")
  elsif ret_codes[:lzma_unsupported_check] == code
    warn("Couldn't verify archive integrity--archive has an unsupported integrity checksum.")
  elsif ret_codes[:lzma_get_check] == code
    nil #This isn't useful for us. It indicates that the checksum type is now known.
  else
    LZMAError.raise_if_necessary(code)
  end
end
459
+
460
+ end #class << self
461
+
462
+ end