zim-ruby 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/LICENSE +674 -0
- data/README.markdown +3 -0
- data/examples/unpack.rb +20 -0
- data/lib/xz.rb +462 -0
- data/lib/zim.rb +18 -0
- data/lib/zim/cluster.rb +61 -0
- data/lib/zim/exceptions.rb +15 -0
- data/lib/zim/file.rb +122 -0
- data/lib/zim/structs.rb +84 -0
- data/lib/zim/url.rb +67 -0
- data/zim-ruby.gemspec +19 -0
- metadata +77 -0
data/lib/zim.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'stringio'
|
3
|
+
require 'xz'
|
4
|
+
|
5
|
+
require 'zim/file'
|
6
|
+
require 'zim/structs'
|
7
|
+
require 'zim/cluster'
|
8
|
+
require 'zim/exceptions'
|
9
|
+
require 'zim/url'
|
10
|
+
|
11
|
+
# Zim module
|
12
|
+
#
|
13
|
+
# Example:
|
14
|
+
# zim = Zim::ZimFile.new('test.zim')
|
15
|
+
# p zim.urls.first.blob
|
16
|
+
module Zim
|
17
|
+
end
|
18
|
+
|
data/lib/zim/cluster.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
module Zim
|
2
|
+
|
3
|
+
# Entry in the cluster list.
#
# A cluster starts with one compression byte followed by a table of 32-bit
# blob offsets and the blob data. Compression values 0 and 1 are read as
# plain data, 4 is decompressed with XZ; anything else is rejected.
class ClusterEntry
  # compression of the cluster (the first byte of the cluster)
  attr_reader :compression

  # Read a cluster entry from the file.
  #
  # +f+ must be positioned on the cluster's compression byte; the position
  # right after that byte is remembered as the start of the cluster data.
  def initialize(f)
    @compression = f.read_int8

    @f = f
    @pos = f.tell
    @offsets = nil
  end

  # Read the blob number +blob+ from the cluster and return its data.
  #
  # Raises UnknownCompression for an unsupported compression byte and
  # InvalidBlobNumber when +blob+ is outside the cluster's offset table.
  def read_blob(blob)
    case @compression
    when 0, 1
      read_blob_uncompressed(nil, blob)
    when 4
      read_blob_lzma(blob)
    else
      raise UnknownCompression, "unsupported cluster compression type #{@compression.inspect}"
    end
  end

  # Read blob +blob+ from plain cluster data.
  #
  # +io+ is the IO to read from (nil means the file itself). When +seek+ is
  # true the offsets are relative to the cluster start inside the file;
  # when false, +io+ already starts at the cluster data.
  def read_blob_uncompressed(io, blob, seek = true) # :nodoc:
    # Assign the cache only once the whole table was read, so a failed read
    # cannot leave a truncated table behind for later calls.
    @offsets ||= load_offsets(io, seek)

    blob_count = @offsets.size - 1
    if blob < 0 || blob >= blob_count
      raise InvalidBlobNumber, "blob #{blob} is out of range (cluster holds #{blob_count} blobs)"
    end

    start = @offsets[blob]
    length = @offsets[blob + 1] - start
    start += @pos if seek

    @f.read_str(length, start, io)
  end

  # Decompress the cluster with XZ and read blob +blob+ from the result.
  def read_blob_lzma(blob) # :nodoc:
    @f.seek(@pos)
    io = StringIO.new(XZ.decompress_stream(@f.file))
    read_blob_uncompressed(io, blob, false)
  end

  private

  # Read the offset table. The first offset doubles as the table size in
  # bytes, so dividing it by four gives the number of offsets.
  def load_offsets(io, seek)
    @f.seek(@pos, io) if seek

    first = @f.read_int32(nil, io)
    offsets = [first]
    ((first >> 2) - 1).times { offsets << @f.read_int32(nil, io) }
    offsets
  end
end
|
61
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Zim
|
2
|
+
|
3
|
+
# file contains an invalid magic
#
# Derives from StandardError so that a plain +rescue+ catches it.
class InvalidZimMagic < StandardError
end
|
6
|
+
|
7
|
+
# the cluster compression is not supported
#
# Derives from StandardError so that a plain +rescue+ catches it.
class UnknownCompression < StandardError
end
|
10
|
+
|
11
|
+
# the blob number is out of range
#
# Derives from StandardError so that a plain +rescue+ catches it.
class InvalidBlobNumber < StandardError
end
|
14
|
+
end
|
15
|
+
|
data/lib/zim/file.rb
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
module Zim
|
2
|
+
|
3
|
+
# ZIM magic number
|
4
|
+
ZIM_MAGIC = 72173914
|
5
|
+
|
6
|
+
# Main class of the zim-ruby library.
#
# Opens a ZIM archive and loads its header, mime type list, url directory,
# title list and cluster directory. The +read_*+ helpers decode
# little-endian values either from the archive itself or from another IO
# passed as +io+; an optional +pos+ seeks there before reading.
class ZimFile
  # header information (used internally)
  attr_reader :header

  # list of included mime types (used internally)
  attr_reader :mime_types

  # Directory of urls (used internally)
  attr_reader :urls

  # mapping of titles to urls (used internally)
  attr_reader :titles

  # cluster list (used internally)
  attr_reader :clusters

  # IO associated with this ZimFile
  attr_reader :file

  # Load data from the given file.
  #
  # Raises InvalidZimMagic when the header magic does not match ZIM_MAGIC.
  # The file handle is closed again if loading fails for any reason.
  def initialize(filename)
    loaded = false

    # Binary mode: the archive is raw bytes, so no newline or encoding
    # translation may be applied while reading.
    @file = File.new(filename, 'rb')

    @header = FileHeader.new(self)
    raise InvalidZimMagic, "#{filename}: invalid ZIM magic number" if @header.magic != ZIM_MAGIC

    seek(@header.mime_list_pos)
    @mime_types = StringList.new(self)

    seek(@header.url_pos)
    @urls = Directory.new(self, @header.article_count, Url)

    seek(@header.title_pos)
    @titles = TitleList.new(self, @header.article_count)

    seek(@header.cluster_pos)
    @clusters = Directory.new(self, @header.cluster_count, ClusterEntry)

    loaded = true
  ensure
    # Do not leak the descriptor when the constructor raises.
    @file.close if !loaded && @file && !@file.closed?
  end

  # Close the underlying file. Safe to call more than once.
  def close
    @file.close unless @file.closed?
  end

  # access a url by full url (including namespace)
  # e.g.: zim['/A/Table of Contents']
  #
  # Returns the first matching entry, or nil when there is none.
  def [](idx)
    @urls.find { |entry| entry.to_s == idx }
  end

  def seek(pos, io = nil) # :nodoc:
    io ||= @file
    io.seek(pos)
  end

  def tell(io = nil) # :nodoc:
    io ||= @file
    io.tell
  end

  # The integer readers decode with String#unpack instead of indexing the
  # string: String#[] returns a one-character String (not a byte value) on
  # Ruby 1.9 and later, which broke the shift/or arithmetic.

  # Read an unsigned 8-bit integer.
  def read_int8(pos = nil, io = nil) # :nodoc:
    read_bytes(1, pos, io).unpack('C').first
  end

  # Read an unsigned little-endian 16-bit integer.
  def read_int16(pos = nil, io = nil) # :nodoc:
    read_bytes(2, pos, io).unpack('v').first
  end

  # Read an unsigned little-endian 32-bit integer.
  def read_int32(pos = nil, io = nil) # :nodoc:
    read_bytes(4, pos, io).unpack('V').first
  end

  # Read an unsigned little-endian 64-bit integer.
  def read_int64(pos = nil, io = nil) # :nodoc:
    # Two 32-bit halves keep this independent of 64-bit unpack directives.
    low, high = read_bytes(8, pos, io).unpack('VV')
    (high << 32) | low
  end

  # Read +size+ bytes; a size of 0 yields an empty string.
  def read_str(size, pos = nil, io = nil) # :nodoc:
    io ||= @file
    seek(pos, io) unless pos.nil?

    return '' if size == 0

    io.read(size)
  end

  # Read a NUL-terminated string and return it without the terminator.
  #
  # Raises EOFError when the data ends before a terminator is found.
  def read_cstr(pos = nil, io = nil) # :nodoc:
    io ||= @file
    seek(pos, io) unless pos.nil?

    str = io.gets("\0")
    raise EOFError, 'unexpected end of data while reading a string' if str.nil? || !str.end_with?("\0")

    str.chomp("\0")
  end

  private

  # Read exactly +count+ bytes, raising EOFError on a short read.
  def read_bytes(count, pos, io)
    io ||= @file
    seek(pos, io) unless pos.nil?

    data = io.read(count)
    if data.nil? || data.bytesize < count
      raise EOFError, "unexpected end of data while reading #{count} byte(s)"
    end

    data
  end
end
|
121
|
+
end
|
122
|
+
|
data/lib/zim/structs.rb
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
module Zim
|
2
|
+
|
3
|
+
# Header data of a zim file.
#
# All fields are read sequentially from the reader's current position, in
# the order given by the table in #initialize.
class FileHeader
  # magic number
  attr_reader :magic

  # zim version
  attr_reader :version

  # uuid (16 raw bytes)
  attr_reader :uuid

  # counts of articles and clusters
  attr_reader :article_count, :cluster_count

  attr_reader :url_pos, :title_pos, :cluster_pos, :mime_list_pos # :nodoc:

  # main page index
  attr_reader :main_page

  # layout page index
  attr_reader :layout_page

  attr_reader :checksum_pos # :nodoc:

  # Read the file header from +f+.
  #
  # Each row names the instance variable, the reader method to call on +f+
  # and any arguments for it; rows are processed top to bottom.
  def initialize(f)
    layout = [
      [:@magic, :read_int32],
      [:@version, :read_int32],
      [:@uuid, :read_str, 16],
      [:@article_count, :read_int32],
      [:@cluster_count, :read_int32],
      [:@url_pos, :read_int64],
      [:@title_pos, :read_int64],
      [:@cluster_pos, :read_int64],
      [:@mime_list_pos, :read_int64],
      [:@main_page, :read_int32],
      [:@layout_page, :read_int32],
      [:@checksum_pos, :read_int64]
    ]

    layout.each do |ivar, reader, *args|
      instance_variable_set(ivar, f.send(reader, *args))
    end
  end
end
|
44
|
+
|
45
|
+
# List of strings read from a file.
#
# Strings are read with +read_cstr+ until an empty string is returned; the
# empty terminator itself is not stored.
class StringList < Array
  # Read the list from +f+, starting at its current position.
  def initialize(f)
    super()

    loop do
      str = f.read_cstr
      break if str.empty?

      self << str
    end
  end
end
|
56
|
+
|
57
|
+
# List of entries located through a table of 64-bit file positions.
#
# The table of +count+ positions is read first, from the reader's current
# position; afterwards each position is visited in table order and an
# entry is built there with <tt>clazz.new(f)</tt>.
class Directory < Array
  # +f+ is the reader, +count+ the number of entries and +clazz+ the class
  # instantiated for every entry.
  def initialize(f, count, clazz)
    super()

    # Read the whole table before seeking away from it.
    positions = (0...count).map { f.read_int64 }

    positions.each do |pos|
      f.seek(pos)
      self << clazz.new(f)
    end
  end
end
|
73
|
+
|
74
|
+
# List of 32-bit values read consecutively from a file.
class TitleList < Array
  # Read +count+ values from +f+ with +read_int32+, keeping their order.
  def initialize(f, count)
    super()

    count.times { push(f.read_int32) }
  end
end
|
84
|
+
end
|
data/lib/zim/url.rb
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
module Zim
|
2
|
+
|
3
|
+
# url entry class
#
# An entry is either a redirect (mime type field 0xffff), which carries the
# index of another url entry, or a content entry, which carries the cluster
# and blob number of its data.
class Url
  # Upper bound on redirect hops followed by #blob, so that a redirect
  # cycle cannot loop forever.
  MAX_REDIRECT_HOPS = 16

  attr_reader :mime_type
  attr_reader :namespace
  attr_reader :revision
  attr_reader :redirect_index
  attr_reader :cluster_number
  attr_reader :blob_number
  attr_reader :url
  attr_reader :title
  attr_reader :params

  # Read a url entry from +f+ at its current position.
  #
  # Both entry kinds share the leading fields (mime type, parameter length,
  # namespace, revision) and the trailing ones (url, title, parameters);
  # only the middle part differs.
  def initialize(f)
    @f = f
    mime_index = f.read_int16
    param_len = f.read_int8
    @namespace = f.read_int8.chr
    @revision = f.read_int32

    if mime_index == 0xffff
      # redirect
      @mime_type = nil
      @redirect_index = f.read_int32
      @cluster_number = nil
      @blob_number = nil
    else
      @mime_type = f.mime_types[mime_index]
      @redirect_index = nil
      @cluster_number = f.read_int32
      @blob_number = f.read_int32
    end

    @url = f.read_cstr
    @title = f.read_cstr
    @params = f.read_str(param_len)
  end

  # checks if this url is a redirect
  def redirect?
    # Test the redirect index rather than the mime type: a content entry
    # whose mime index is missing from the mime list also has a nil mime
    # type, but it is not a redirect.
    !@redirect_index.nil?
  end

  # Receive the blob of this url.
  #
  # For a redirect the blob of the redirect target is returned. Returns nil
  # when the redirect points to a missing entry or the chain exceeds
  # MAX_REDIRECT_HOPS.
  def blob
    target = resolve_redirect
    return nil if target.nil?

    @f.clusters[target.cluster_number].read_blob(target.blob_number)
  end

  def to_s
    "/#{@namespace}/#{@url}"
  end

  private

  # Follow redirects starting at this entry and return the first entry that
  # is not a redirect, or nil if there is none within the hop limit.
  def resolve_redirect
    target = self
    hops = 0

    while target && target.redirect?
      hops += 1
      return nil if hops > MAX_REDIRECT_HOPS

      target = @f.urls[target.redirect_index]
    end

    target
  end
end
|
66
|
+
end
|
67
|
+
|
data/zim-ruby.gemspec
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
# Gem specification for zim-ruby.
Gem::Specification.new do |s|
  s.name = 'zim-ruby'
  s.version = '1.0.1'
  s.platform = Gem::Platform::RUBY
  s.authors = ['Christoph Plank']
  s.email = ['chrisistuff@gmail.com']
  s.homepage = 'http://rubygems.org/gems/zim-ruby'
  s.summary = %q{Library to read openzim (wikipedia) files}
  s.description = %q{This library can be used to read openzim files like the ones exported by wikipedia}

  # NOTE: `s.has_rdoc = false` was dropped here. The setter was deprecated
  # and later removed from RubyGems, where calling it makes the gemspec
  # fail to load.

  #s.add_dependency('ruby-xz', '>= 0.0.2')
  # added xz.rb directly to make it ruby 1.8 compatible

  # Package every file tracked by git.
  s.files = `git ls-files`.split("\n")
  s.require_paths = ['lib']
end
|
metadata
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: zim-ruby
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 21
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 1
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
version: 1.0.1
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Christoph Plank
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-11-18 00:00:00 Z
|
19
|
+
dependencies: []
|
20
|
+
|
21
|
+
description: This library can be used to read openzim files like the ones exported by wikipedia
|
22
|
+
email:
|
23
|
+
- chrisistuff@gmail.com
|
24
|
+
executables: []
|
25
|
+
|
26
|
+
extensions: []
|
27
|
+
|
28
|
+
extra_rdoc_files: []
|
29
|
+
|
30
|
+
files:
|
31
|
+
- .gitignore
|
32
|
+
- LICENSE
|
33
|
+
- README.markdown
|
34
|
+
- examples/unpack.rb
|
35
|
+
- lib/xz.rb
|
36
|
+
- lib/zim.rb
|
37
|
+
- lib/zim/cluster.rb
|
38
|
+
- lib/zim/exceptions.rb
|
39
|
+
- lib/zim/file.rb
|
40
|
+
- lib/zim/structs.rb
|
41
|
+
- lib/zim/url.rb
|
42
|
+
- zim-ruby.gemspec
|
43
|
+
homepage: http://rubygems.org/gems/zim-ruby
|
44
|
+
licenses: []
|
45
|
+
|
46
|
+
post_install_message:
|
47
|
+
rdoc_options: []
|
48
|
+
|
49
|
+
require_paths:
|
50
|
+
- lib
|
51
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
52
|
+
none: false
|
53
|
+
requirements:
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
hash: 3
|
57
|
+
segments:
|
58
|
+
- 0
|
59
|
+
version: "0"
|
60
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
hash: 3
|
66
|
+
segments:
|
67
|
+
- 0
|
68
|
+
version: "0"
|
69
|
+
requirements: []
|
70
|
+
|
71
|
+
rubyforge_project:
|
72
|
+
rubygems_version: 1.8.11
|
73
|
+
signing_key:
|
74
|
+
specification_version: 3
|
75
|
+
summary: Library to read openzim (wikipedia) files
|
76
|
+
test_files: []
|
77
|
+
|