zim-ruby 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,3 @@
1
+ # About
2
+
3
+ zim-ruby is a Ruby library for reading openZIM (http://openzim.org) files.
@@ -0,0 +1,20 @@
1
#!/usr/bin/env ruby
#
# Writes the blob of every URL entry of a ZIM file below a target directory.
#
# Usage: unpack.rb <zimfile> <directory>

$LOAD_PATH << '../lib'

require 'fileutils'
require 'zim'

if ARGV.size != 2
  puts "unpack.rb <zimfile> <directory>"
  exit 1
end

out = ARGV[1]

f = Zim::ZimFile.new(ARGV[0])
f.urls.each do |url|
  puts url

  # mkdir_p (unlike Dir.mkdir) also creates <directory> itself when it is
  # missing and is a no-op when the namespace directory already exists.
  FileUtils.mkdir_p("#{out}/#{url.namespace}")

  target = "#{out}#{url.to_s}"

  # NOTE(review): the entry name is taken from the ZIM file and used in the
  # path unchecked; a crafted file containing ".." components could write
  # outside of <directory>. Validate before unpacking untrusted archives.

  # Entry names may themselves contain slashes, so make sure the parent
  # directory of the target exists before opening it.
  FileUtils.mkdir_p(File.dirname(target))

  # The block form closes the handle after each entry (File.new leaked one
  # descriptor per entry); binary mode keeps the blob bytes untouched.
  File.open(target, 'wb') { |io| io.write(url.blob) }
end
20
+
@@ -0,0 +1,462 @@
1
+ #Encoding: UTF-8
2
+ =begin (The MIT License)
3
+
4
+ Basic liblzma-bindings for Ruby.
5
+
6
+ Copyright © 2011 Marvin Gülker
7
+
8
+ Permission is hereby granted, free of charge, to any person obtaining a
9
+ copy of this software and associated documentation files (the ‘Software’),
10
+ to deal in the Software without restriction, including without limitation
11
+ the rights to use, copy, modify, merge, publish, distribute, sublicense,
12
+ and/or sell copies of the Software, and to permit persons to whom the Software
13
+ is furnished to do so, subject to the following conditions:
14
+
15
+ The above copyright notice and this permission notice shall be included in all
16
+ copies or substantial portions of the Software.
17
+
18
+ THE SOFTWARE IS PROVIDED ‘AS IS’, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24
+ THE SOFTWARE.
25
+ =end
26
+
27
+ require "ffi"
28
+ require 'stringio'
29
+
30
+ #The namespace and main module of this library. Any method of this module
31
+ #may raise an exception of class XZ::LZMAError; this is not repeated in
32
+ #the documentation of the individual methods.
33
+ module XZ
34
+
35
#This module wraps functions and enums used by liblzma.
module LibLZMA
  extend FFI::Library

  #The maximum value of an uint64_t, as defined by liblzma.
  #Should be the same as
  #  (2 ** 64) - 1
  UINT64_MAX = 18446744073709551615

  #Activates extreme compression. Same as xz's "-e" commandline switch.
  LZMA_PRESET_EXTREME = 1 << 31

  #Decoder flags. They are OR-ed together into one bit mask by
  #XZ.decompress_stream, so every flag needs its own bit.
  #LZMA_TELL_NO_CHECK used to share 0x02 with LZMA_TELL_UNSUPPORTED_CHECK,
  #which made the two flags indistinguishable; liblzma defines it as 0x01.
  LZMA_TELL_NO_CHECK          = 0x01
  LZMA_TELL_UNSUPPORTED_CHECK = 0x02
  LZMA_TELL_ANY_CHECK         = 0x04
  LZMA_CONCATENATED           = 0x08

  #Placeholder enum used by liblzma for later additions.
  LZMA_RESERVED_ENUM = enum :lzma_reserved_enum, 0

  #Actions that can be passed to the lzma_code() function.
  LZMA_ACTION = enum :lzma_run, 0,
    :lzma_sync_flush,
    :lzma_full_flush,
    :lzma_finish

  #Integrity check algorithms supported by liblzma.
  LZMA_CHECK = enum :lzma_check_none, 0,
    :lzma_check_crc32, 1,
    :lzma_check_crc64, 4,
    :lzma_check_sha256, 10

  #Possible return values of liblzma functions.
  LZMA_RET = enum :lzma_ok, 0,
    :lzma_stream_end,
    :lzma_no_check,
    :lzma_unsupported_check,
    :lzma_get_check,
    :lzma_mem_error,
    :lzma_memlimit_error,
    :lzma_format_error,
    :lzma_options_error,
    :lzma_data_error,
    :lzma_buf_error,
    :lzma_prog_error

  #Candidate library names, tried in order. The versioned name at the end
  #covers systems that ship liblzma without the unversioned symlink.
  ffi_lib ['lzma.so.2', 'lzma.so', 'lzma', 'liblzma.so.5']

  #All functions are attached with plain :int arguments and return values,
  #so callers receive raw integers, not LZMA_RET symbols.
  attach_function :lzma_easy_encoder, [:pointer, :uint32, :int], :int
  attach_function :lzma_code, [:pointer, :int], :int
  attach_function :lzma_stream_decoder, [:pointer, :uint64, :uint32], :int
  attach_function :lzma_end, [:pointer], :void

end
89
+
90
#The class of the error that this library raises.
class LZMAError < StandardError

  #Raises an appropriate exception if +val+ isn't a liblzma success code.
  #===Parameters
  #[val] Either a symbol of the LibLZMA::LZMA_RET enum or the raw integer
  #      returned by a liblzma function.
  #===Return value
  #+nil+ if +val+ does not denote an error.
  #===Remarks
  #The liblzma functions are attached with an :int return type, so they
  #hand back integers. Those are translated to their enum symbol first;
  #without that step no branch below could ever match and every error
  #would be ignored silently.
  def self.raise_if_necessary(val)
    val = LibLZMA::LZMA_RET[val] if val.is_a?(Integer)

    case val
    when :lzma_mem_error      then raise(self, "Couldn't allocate memory!")
    when :lzma_memlimit_error then raise(self, "Decoder ran out of (allowed) memory!")
    when :lzma_format_error   then raise(self, "Unrecognized file format!")
    when :lzma_options_error  then raise(self, "Invalid options passed!")
    when :lzma_data_error     then raise(self, "Archive is corrupt.")
    when :lzma_buf_error      then raise(self, "Buffer unusable!")
    when :lzma_prog_error     then raise(self, "Program error--if you're sure your code is correct, you may have found a bug in liblzma.")
    end
  end

end
107
+
108
#The main struct of the liblzma library.
class LZMAStream < FFI::Struct
  layout :next_in, :pointer, #uint8
    :avail_in, :size_t,
    :total_in, :uint64,
    :next_out, :pointer, #uint8
    :avail_out, :size_t,
    :total_out, :uint64,
    :lzma_allocator, :pointer,
    :lzma_internal, :pointer,
    :reserved_ptr1, :pointer,
    :reserved_ptr2, :pointer,
    :reserved_ptr3, :pointer,
    :reserved_ptr4, :pointer,
    :reserved_int1, :uint64,
    :reserved_int2, :uint64,
    :reserved_int3, :size_t,
    :reserved_int4, :size_t,
    :reserved_enum1, :int,
    :reserved_enum2, :int

  #This method does basically the same thing as the
  #LZMA_STREAM_INIT macro of liblzma. Creates a new LZMAStream
  #that has been initialized for usage. If any argument is passed,
  #it is assumed to be a FFI::Pointer to a lzma_stream structure
  #and that structure is wrapped.
  #
  #The two branches used to be swapped: passing a pointer allocated a new
  #struct instead of wrapping it and then failed on a non-existent
  #:next field.
  def initialize(*args)
    if args.empty? #Nothing to wrap, set up a fresh stream
      super()
      self[:next_in] = nil
      self[:avail_in] = 0
      self[:total_in] = 0
      self[:next_out] = nil
      self[:avail_out] = 0
      self[:total_out] = 0
      self[:lzma_allocator] = nil
      self[:lzma_internal] = nil
      self[:reserved_ptr1] = nil
      self[:reserved_ptr2] = nil
      self[:reserved_ptr3] = nil
      self[:reserved_ptr4] = nil
      self[:reserved_int1] = 0
      self[:reserved_int2] = 0
      self[:reserved_int3] = 0
      self[:reserved_int4] = 0
      self[:reserved_enum1] = LibLZMA::LZMA_RESERVED_ENUM[:lzma_reserved_enum]
      self[:reserved_enum2] = LibLZMA::LZMA_RESERVED_ENUM[:lzma_reserved_enum]
    else #Got a pointer, want to wrap it
      super(*args)
    end
  end
end
161
+
162
+ #Number of bytes read in one chunk.
163
+ CHUNK_SIZE = 4096
164
+ #The version of this library.
165
+ VERSION = "0.0.2".freeze
166
+
167
+ class << self
168
+
169
#call-seq:
#  decompress_stream(io [, memory_limit [, flags ] ] )                 → a_string
#  decompress_stream(io [, memory_limit [, flags ] ] ){|chunk| ... }   → an_integer
#  decode_stream(io [, memory_limit [, flags ] ] )                     → a_string
#  decode_stream(io [, memory_limit [, flags ] ] ){|chunk| ... }       → an_integer
#
#Decompresses a stream containing XZ-compressed data.
#===Parameters
#[io]           The IO to read from. It must be opened for reading.
#[memory_limit] (+UINT64_MAX+) If not XZ::LibLZMA::UINT64_MAX, makes liblzma
#               use no more memory than +memory_limit+ bytes.
#[flags]        (<tt>[:tell_unsupported_check]</tt>) Additional flags
#               passed to liblzma (an array). Possible flags are:
#               [:tell_no_check] Spit out a warning if the archive hasn't an
#                                integrity checksum.
#               [:tell_unsupported_check] Spit out a warning if the archive
#                                         has an unsupported checksum type.
#               [:tell_any_check] Accepted and passed on to liblzma as
#                                 LZMA_TELL_ANY_CHECK.
#               [:concatenated] Decompress concatenated archives.
#[chunk]        (Block argument) One piece of decompressed data.
#===Return value
#If a block was given, returns the number of bytes written. Otherwise,
#returns the decompressed data as a string.
#===Raises
#[ArgumentError] +memory_limit+ is outside 0..UINT64_MAX or +flags+
#                contains an unknown flag.
#===Example
#  data = File.open("archive.xz", "rb"){|f| f.read}
#  io = StringIO.new(data)
#  XZ.decompress_stream(io) #=> "I AM THE DATA"
#  io.rewind
#  str = ""
#  XZ.decompress_stream(io, XZ::LibLZMA::UINT64_MAX, [:tell_no_check]){|c| str << c} #=> 13
#  str #=> "I AM THE DATA"
#===Remarks
#The block form is *much* better on memory usage, because it doesn't have
#to load everything into RAM at once. If you don't know how big your
#data gets or if you want to decompress much data, use the block form. Of
#course you shouldn't store the data you read in RAM then as in the
#example above.
def decompress_stream(io, memory_limit = LibLZMA::UINT64_MAX, flags = [:tell_unsupported_check], &block)
  raise(ArgumentError, "Invalid memory limit set!") unless (0..LibLZMA::UINT64_MAX).include?(memory_limit)
  flags.each do |flag|
    raise(ArgumentError, "Unknown flag #{flag}!") unless [:tell_no_check, :tell_unsupported_check, :tell_any_check, :concatenated].include?(flag)
  end

  stream = LZMAStream.new
  begin
    res = LibLZMA.lzma_stream_decoder(
      stream.pointer,
      memory_limit,
      flags.inject(0){|val, flag| val | LibLZMA.const_get(:"LZMA_#{flag.to_s.upcase}")}
    )
    LZMAError.raise_if_necessary(res)

    if block_given?
      lzma_code(io, stream, &block)
      stream[:total_out]
    else
      res = ""
      lzma_code(io, stream){|chunk| res << chunk}
      res
    end
  ensure
    #Release liblzma's internal state even if decoding or the caller's
    #block raised; previously such an error leaked it.
    LibLZMA.lzma_end(stream.pointer)
  end
end
alias decode_stream decompress_stream
232
+
233
#call-seq:
#  compress_stream(io [, compression_level [, check [, extreme ] ] ] )               → a_string
#  compress_stream(io [, compression_level [, check [, extreme ] ] ] ){|chunk| ... } → an_integer
#  encode_stream(io [, compression_level [, check [, extreme ] ] ] )                 → a_string
#  encode_stream(io [, compression_level [, check [, extreme ] ] ] ){|chunk| ... }   → an_integer
#
#Compresses a stream of data into XZ-compressed data.
#===Parameters
#[io]                The IO to read the data from. Must be opened for
#                    reading.
#[compression_level] (6) Compression strength. Higher values indicate a
#                    smaller result, but longer compression time. Maximum
#                    is 9.
#[check]             (:crc64) The checksum algorithm to use for verifying
#                    the data inside the archive. Possible values are:
#                    * :none
#                    * :crc32
#                    * :crc64
#                    * :sha256
#[extreme]           (false) Tries to get the last bit out of the
#                    compression. This may succeed, but you can end
#                    up with *very* long computation times.
#[chunk]             (Block argument) One piece of compressed data.
#===Return value
#If a block was given, returns the number of bytes written. Otherwise,
#returns the compressed data as a string.
#===Raises
#[ArgumentError] +compression_level+ is outside 0..9 or +check+ is not
#                one of the values listed above.
#===Example
#  data = File.read("file.txt")
#  i = StringIO.new(data)
#  XZ.compress_stream(i) #=> Some binary blob
#  i.rewind
#  str = ""
#  XZ.compress_stream(i, 4, :sha256){|c| str << c} #=> 123
#  str #=> Some binary blob
#===Remarks
#The block form is *much* better on memory usage, because it doesn't have
#to load everything into RAM at once. If you don't know how big your
#data gets or if you want to compress much data, use the block form. Of
#course you shouldn't store the data you read in RAM then as in the
#example above.
def compress_stream(io, compression_level = 6, check = :crc64, extreme = false, &block)
  raise(ArgumentError, "Invalid compression level!") unless (0..9).include?(compression_level)
  raise(ArgumentError, "Invalid checksum specified!") unless [:none, :crc32, :crc64, :sha256].include?(check)

  stream = LZMAStream.new
  begin
    res = LibLZMA.lzma_easy_encoder(
      stream.pointer,
      compression_level | (extreme ? LibLZMA::LZMA_PRESET_EXTREME : 0),
      LibLZMA::LZMA_CHECK[:"lzma_check_#{check}"]
    )
    LZMAError.raise_if_necessary(res)

    if block_given?
      lzma_code(io, stream, &block)
      stream[:total_out]
    else
      res = ""
      lzma_code(io, stream){|chunk| res << chunk}
      res
    end
  ensure
    #Release liblzma's internal state even if encoding or the caller's
    #block raised; previously such an error leaked it.
    LibLZMA.lzma_end(stream.pointer)
  end
end
alias encode_stream compress_stream
298
+
299
#Reads +in_file+, compresses its content and stores the archive
#in +out_file+.
#===Parameters
#[in_file]  Path of the file whose content gets compressed.
#[out_file] Path of the archive to create. An existing file is
#           overwritten.
#The remaining parameters are handed to compress_stream unchanged;
#see there for their meaning.
#===Return value
#The number of bytes written, i.e. the size of the archive.
#===Example
#  XZ.compress_file("myfile.txt", "myfile.txt.xz")
#  XZ.compress_file("myarchive.tar", "myarchive.tar.xz")
#===Remarks
#The input is processed chunk by chunk, so big files are never held
#in memory completely.
def compress_file(in_file, out_file, compression_level = 6, check = :crc64, extreme = false)
  File.open(in_file, "rb") do |source|
    File.open(out_file, "wb") do |archive|
      compress_stream(source, compression_level, check, extreme){|piece| archive.write(piece)}
    end
  end
end
322
+
323
#XZ-compresses a string held in memory.
#===Parameters
#[str] The data to compress.
#The remaining parameters are handed to compress_stream unchanged;
#see there for their meaning.
#===Return value
#The compressed data as a string.
#===Example
#  data = "I love Ruby"
#  comp = XZ.compress(data) #=> binary blob
#===Remarks
#Input and output are both kept in memory completely. For big
#amounts of data use compress_file or compress_stream instead.
def compress(str, compression_level = 6, check = :crc64, extreme = false)
  unless defined?(StringIO)
    raise(NotImplementedError, "StringIO isn't available!")
  end

  compress_stream(StringIO.new(str), compression_level, check, extreme)
end
340
+
341
#Decompresses XZ data held in memory.
#===Parameters
#[str] The data to decompress.
#The remaining parameters are handed to decompress_stream unchanged;
#see there for their meaning.
#===Return value
#The decompressed data as a string.
#===Example
#  comp = File.open("data.xz", "rb"){|f| f.read}
#  data = XZ.decompress(comp) #=> "I love Ruby"
#===Remarks
#Input and output are both kept in memory completely. For big
#amounts of data use decompress_file or decompress_stream instead.
def decompress(str, memory_limit = LibLZMA::UINT64_MAX, flags = [:tell_unsupported_check])
  unless defined?(StringIO)
    raise(NotImplementedError, "StringIO isn't available!")
  end

  decompress_stream(StringIO.new(str), memory_limit, flags)
end
358
+
359
#Reads the archive +in_file+, decompresses it and stores the result
#in +out_file+.
#===Parameters
#[in_file]  Path of the archive to read.
#[out_file] Path of the file to create. An existing file is
#           overwritten.
#The remaining parameters are handed to decompress_stream unchanged;
#see there for their meaning.
#===Return value
#The number of bytes written, i.e. the size of the uncompressed data.
#===Example
#  XZ.decompress_file("myfile.txt.xz", "myfile.txt")
#  XZ.decompress_file("myarchive.tar.xz", "myarchive.tar")
#===Remarks
#The input is processed chunk by chunk, so big files are never held
#in memory completely.
def decompress_file(in_file, out_file, memory_limit = LibLZMA::UINT64_MAX, flags = [:tell_unsupported_check])
  File.open(in_file, "rb") do |archive|
    File.open(out_file, "wb") do |target|
      decompress_stream(archive, memory_limit, flags){|piece| target.write(piece)}
    end
  end
end
382
+
383
+ private
384
+
385
#This method returns the size of +str+ in bytes (not in characters).
#
#String#bytesize counts the bytes directly; the previous implementation
#duplicated the whole chunk and re-tagged its encoding just to do so.
def binary_size(str)
  str.bytesize
end
395
+
396
#This method does the heavy work of (de-)compressing a stream. It takes
#an IO object to read data from (that means the IO must be opened
#for reading) and a XZ::LZMAStream object that is used to (de-)compress
#the data. Furthermore this method takes a block which gets passed
#the (de-)compressed data in chunks one at a time--this is needed to allow
#(de-)compressing of very large files that can't be loaded fully into
#memory.
#
#Input is read in pieces of CHUNK_SIZE bytes. The piece after which the IO
#reports EOF is handed to liblzma with the lzma_finish action, all other
#pieces with lzma_run. An IO that is empty from the start is treated as
#one empty final piece, so the coder is still told to finish; before,
#nothing was coded at all for empty input.
#
#Returns +nil+.
def lzma_code(io, stream)
  input_buffer_p  = FFI::MemoryPointer.new(CHUNK_SIZE)
  output_buffer_p = FFI::MemoryPointer.new(CHUNK_SIZE)

  loop do
    #IO#read with a length returns nil at EOF.
    str = io.read(CHUNK_SIZE) || ""
    last_chunk = str.empty? || io.eof?
    action = LibLZMA::LZMA_ACTION[last_chunk ? :lzma_finish : :lzma_run]

    #Set the data for (de-)compressing
    input_buffer_p.write_string(str) unless str.empty?
    stream[:next_in] = input_buffer_p
    stream[:avail_in] = binary_size(str)

    #Now loop until we gathered all the data in stream[:next_out]. Depending
    #on the amount of data, this may not fit into the buffer, meaning that
    #we have to hand liblzma the output buffer again. The input is not
    #replaced in between; the original author notes that liblzma has already
    #consumed it by then, so the extra call to lzma_code() doesn't hurt.
    loop do
      #Prepare for getting the coded data
      stream[:next_out] = output_buffer_p
      stream[:avail_out] = CHUNK_SIZE

      check_lzma_code_retval(LibLZMA.lzma_code(stream.pointer, action))

      #Hand over whatever liblzma wrote into the output buffer
      yield(output_buffer_p.read_string(CHUNK_SIZE - stream[:avail_out]))

      #If the buffer is completely filled, it's likely that there is
      #more data liblzma wants to hand to us. Start a new iteration,
      #but don't provide new input data.
      break unless stream[:avail_out] == 0
    end

    break if last_chunk
  end
end #lzma_code
446
+
447
#Checks for errors and warnings that can be derived from the return
#value of the lzma_code() function and shows them if necessary.
#
#+code+ may be the raw integer returned by lzma_code() or the matching
#LibLZMA::LZMA_RET symbol. Integers are translated to their symbol first,
#because LZMAError.raise_if_necessary matches on symbols; handing it the
#raw integer meant that coder errors were never raised.
def check_lzma_code_retval(code)
  status = code.is_a?(Integer) ? LibLZMA::LZMA_RET[code] : code

  case status
  when :lzma_no_check          then warn("Couldn't verify archive integrity--archive has not integrity checksum.")
  when :lzma_unsupported_check then warn("Couldn't verify archive integrity--archive has an unsupported integrity checksum.")
  when :lzma_get_check         then nil #This isn't useful for us. It indicates that the checksum type is now known.
  else
    LZMAError.raise_if_necessary(status)
  end
end
459
+
460
+ end #class << self
461
+
462
+ end