zim-ruby 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/LICENSE +674 -0
- data/README.markdown +3 -0
- data/examples/unpack.rb +20 -0
- data/lib/xz.rb +462 -0
- data/lib/zim.rb +18 -0
- data/lib/zim/cluster.rb +61 -0
- data/lib/zim/exceptions.rb +15 -0
- data/lib/zim/file.rb +122 -0
- data/lib/zim/structs.rb +84 -0
- data/lib/zim/url.rb +67 -0
- data/zim-ruby.gemspec +19 -0
- metadata +77 -0
data/lib/zim.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'stringio'
|
3
|
+
require 'xz'
|
4
|
+
|
5
|
+
require 'zim/file'
|
6
|
+
require 'zim/structs'
|
7
|
+
require 'zim/cluster'
|
8
|
+
require 'zim/exceptions'
|
9
|
+
require 'zim/url'
|
10
|
+
|
11
|
+
# Zim module
#
# Namespace for the zim-ruby library. The classes themselves are defined in
# the files required above.
#
# Example:
#   zim = Zim::ZimFile.new('test.zim')
#   p zim.urls.first.blob
module Zim
end
|
18
|
+
|
data/lib/zim/cluster.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
module Zim
|
2
|
+
|
3
|
+
# entry in the cluster list
#
# Holds the compression flag of one cluster and reads individual blobs from
# it on demand. The blob offset table is read on first access and kept.
class ClusterEntry

  # compression of the cluster
  attr_reader :compression

  # read a cluster from the file
  #
  # Reads the compression byte from +f+ and remembers the position right
  # after it; offsets in the cluster are relative to that position.
  def initialize(f)
    @compression = f.read_int8

    @f = f
    @pos = f.tell
    @offsets = nil
  end

  # read the blob +blob+ from the cluster
  #
  # Raises UnknownCompression for any compression flag other than 0, 1 or 4.
  def read_blob(blob)
    return read_blob_uncompressed(nil, blob) if [0, 1].include?(@compression)
    return read_blob_lzma(blob) if @compression == 4

    raise UnknownCompression
  end

  def read_blob_uncompressed(io, blob, seek = true) # :nodoc:
    if @offsets.nil?
      @offsets = []

      @f.seek(@pos, io) if seek

      # the first offset also tells how many offsets there are (4 bytes each)
      first = @f.read_int32(nil, io)
      @offsets << first
      ((first >> 2) - 1).times { @offsets << @f.read_int32(nil, io) }
    end

    # the last offset only marks the end of the final blob
    raise InvalidBlobNumber unless (blob >= 0) && (blob < (@offsets.count - 1))

    start = @offsets[blob]
    length = @offsets[blob + 1] - start
    start += @pos if seek

    @f.read_str(length, start, io)
  end

  def read_blob_lzma(blob) # :nodoc:
    @f.seek(@pos)
    decompressed = XZ.decompress_stream(@f.file)
    read_blob_uncompressed(StringIO.new(decompressed), blob, false)
  end
end
|
61
|
+
end
|
data/lib/zim/exceptions.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
module Zim

  # Error classes raised by the library. They derive from StandardError so
  # that a plain +rescue+ catches them; +rescue Exception+ and rescuing the
  # specific class keep working as before.

  # file contains an invalid magic
  class InvalidZimMagic < StandardError
  end

  # the cluster compression is not supported
  class UnknownCompression < StandardError
  end

  # the blob number is out of range
  class InvalidBlobNumber < StandardError
  end
end
|
15
|
+
|
data/lib/zim/file.rb
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
module Zim
|
2
|
+
|
3
|
+
# ZIM magic number
|
4
|
+
ZIM_MAGIC = 72173914
|
5
|
+
|
6
|
+
# Main class of the zim-ruby library
#
# Opens a zim file, reads its header and then loads the mime type list, the
# url directory, the title list and the cluster directory. The low level
# read_* helpers are public because the other classes of the library call
# them while parsing; they are not meant for end users.
class ZimFile
  # header informations (used internally)
  attr_reader :header

  # list of included mime types (used internally)
  attr_reader :mime_types

  # Directory of urls (used internally)
  attr_reader :urls

  # mapping of titles to urls (used internally)
  attr_reader :titles

  # cluster list (used internally)
  attr_reader :clusters

  # IO associated with this ZimFile
  attr_reader :file

  # load data from the given file
  #
  # Raises InvalidZimMagic when the header magic does not match ZIM_MAGIC.
  # The file is opened in binary mode, and it is closed again if loading
  # fails so a broken file does not leak the handle.
  def initialize(filename)
    loaded = false
    @file = File.new(filename, 'rb')

    @header = FileHeader.new(self)
    raise InvalidZimMagic unless @header.magic == ZIM_MAGIC

    seek(@header.mime_list_pos)
    @mime_types = StringList.new(self)

    seek(@header.url_pos)
    @urls = Directory.new(self, @header.article_count, Url)

    seek(@header.title_pos)
    @titles = TitleList.new(self, @header.article_count)

    seek(@header.cluster_pos)
    @clusters = Directory.new(self, @header.cluster_count, ClusterEntry)

    loaded = true
  ensure
    @file.close if @file && !loaded
  end

  # close the underlying file; blobs can no longer be read afterwards
  def close
    @file.close unless @file.nil? || @file.closed?
  end

  # access a url by full url (including namespace)
  # e.g.: zim['/A/Table of Contents']
  #
  # Returns nil when no url matches.
  def [](idx)
    @urls.detect { |x| x.to_s == idx }
  end

  def seek(pos, io = nil) # :nodoc:
    io ||= @file
    io.seek(pos)
  end

  def tell(io = nil) # :nodoc:
    io ||= @file
    io.tell
  end

  # The integer readers decode unsigned little-endian values with
  # String#unpack. Indexing the string (s[0]) only yields integers on
  # Ruby 1.8; unpack gives the same numbers on every Ruby version.

  def read_int8(pos = nil, io = nil) # :nodoc:
    read_bytes(1, pos, io).unpack('C').first
  end

  def read_int16(pos = nil, io = nil) # :nodoc:
    read_bytes(2, pos, io).unpack('v').first
  end

  def read_int32(pos = nil, io = nil) # :nodoc:
    read_bytes(4, pos, io).unpack('V').first
  end

  def read_int64(pos = nil, io = nil) # :nodoc:
    # two 32 bit halves, because the 64 bit unpack directives with explicit
    # endianness are not available on old rubies
    low, high = read_bytes(8, pos, io).unpack('VV')
    (high << 32) | low
  end

  def read_str(size, pos = nil, io = nil) # :nodoc:
    io ||= @file
    seek(pos, io) unless pos.nil?

    return '' if size == 0

    io.read(size)
  end

  # read a zero terminated string; the terminator is not part of the result.
  # Returns an empty string at the end of the data.
  def read_cstr(pos = nil, io = nil) # :nodoc:
    io ||= @file
    seek(pos, io) unless pos.nil?

    str = io.gets("\0")
    return '' if str.nil?

    str.chomp("\0")
  end

  private

  # read exactly +size+ bytes, optionally seeking to +pos+ first.
  # Raises EOFError when fewer bytes are available.
  def read_bytes(size, pos, io)
    io ||= @file
    seek(pos, io) unless pos.nil?

    data = io.read(size)
    if data.nil? || data.length < size
      raise EOFError, "unexpected end of data (wanted #{size} bytes)"
    end

    data
  end
end
|
121
|
+
end
|
122
|
+
|
data/lib/zim/structs.rb
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
module Zim
|
2
|
+
|
3
|
+
# header data of a zim file
#
# All fields are read sequentially from the reader passed to the
# constructor, in the order they are listed here.
class FileHeader

  # magic number
  attr_reader :magic

  # zim version
  attr_reader :version

  # uuid
  attr_reader :uuid

  attr_reader :article_count, :cluster_count

  attr_reader :url_pos, :title_pos, :cluster_pos, :mime_list_pos # :nodoc:

  # main page index
  attr_reader :main_page

  # layout page index
  attr_reader :layout_page

  attr_reader :checksum_pos # :nodoc:

  # read file header from a file
  #
  # +f+ must respond to read_int32, read_int64 and read_str(size) and be
  # positioned at the start of the header.
  def initialize(f)
    @magic, @version = f.read_int32, f.read_int32
    @uuid = f.read_str(16)
    @article_count, @cluster_count = f.read_int32, f.read_int32
    @url_pos, @title_pos, @cluster_pos, @mime_list_pos = f.read_int64, f.read_int64, f.read_int64, f.read_int64
    @main_page, @layout_page = f.read_int32, f.read_int32
    @checksum_pos = f.read_int64
  end
end
|
44
|
+
|
45
|
+
# Array of zero terminated strings read from a file. Reading stops at the
# first empty string, which is consumed but not stored.
class StringList < Array
  # +f+ must respond to read_cstr
  def initialize(f)
    super()

    while (entry = f.read_cstr).length > 0
      self << entry
    end
  end
end
|
56
|
+
|
57
|
+
# Array of entries located through a table of 64 bit positions. The whole
# table is read first; afterwards each position is visited and an entry is
# built there with clazz.new(f).
class Directory < Array
  # +f+ must respond to read_int64 and seek; +count+ is the number of
  # entries and +clazz+ the class instantiated for each of them.
  def initialize(f, count, clazz)
    super()

    positions = []
    count.times { positions << f.read_int64 }

    positions.each do |position|
      f.seek(position)
      push(clazz.new(f))
    end
  end
end
|
73
|
+
|
74
|
+
# Array of +count+ 32 bit integers read consecutively from a file.
class TitleList < Array
  # +f+ must respond to read_int32
  def initialize(f, count)
    super()

    count.times { push(f.read_int32) }
  end
end
|
84
|
+
end
|
data/lib/zim/url.rb
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
module Zim
|
2
|
+
|
3
|
+
# url entry class
#
# One entry of the url directory: either an article, which points at a blob
# inside a cluster, or a redirect, which points at another url entry.
class Url

  attr_reader :mime_type
  attr_reader :namespace
  attr_reader :revision
  attr_reader :redirect_index
  attr_reader :cluster_number
  attr_reader :blob_number
  attr_reader :url
  attr_reader :title
  attr_reader :params

  # read url from file
  #
  # A mime type index of 0xffff marks a redirect. Redirects carry a
  # redirect_index and have no mime type, cluster number or blob number;
  # articles are the other way round. All other fields are shared.
  def initialize(f)
    @f = f
    mime_index = f.read_int16
    param_len = f.read_int8
    @namespace = f.read_int8.chr
    @revision = f.read_int32

    if mime_index == 0xffff
      # redirect
      @mime_type = nil
      @redirect_index = f.read_int32
      @cluster_number = nil
      @blob_number = nil
    else
      @mime_type = f.mime_types[mime_index]
      @redirect_index = nil
      @cluster_number = f.read_int32
      @blob_number = f.read_int32
    end

    @url = f.read_cstr
    @title = f.read_cstr
    @params = f.read_str(param_len)
  end

  # checks if this url is a redirect
  def redirect?
    # decided by the redirect index, so an article with an unknown mime
    # type index (mime_type == nil) is not mistaken for a redirect
    !@redirect_index.nil?
  end

  # receive the blob of this url
  #
  # Redirects are followed to their target and the target's blob is
  # returned. Raises a RuntimeError if the redirects never reach an article.
  def blob
    target = self
    hops = 0

    while target.redirect?
      hops += 1
      # more hops than entries means we are going in circles
      raise "redirect loop starting at #{self}" if hops > @f.urls.length

      target = @f.urls[target.redirect_index]
    end

    @f.clusters[target.cluster_number].read_blob(target.blob_number)
  end

  def to_s
    "/#{@namespace}/#{@url}"
  end
end
|
66
|
+
end
|
67
|
+
|
data/zim-ruby.gemspec
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
# Gem specification for zim-ruby.
Gem::Specification.new do |s|
  s.name        = 'zim-ruby'
  s.version     = '1.0.1'
  s.platform    = Gem::Platform::RUBY
  s.authors     = ['Christoph Plank']
  s.email       = ['chrisistuff@gmail.com']
  s.homepage    = 'http://rubygems.org/gems/zim-ruby'
  s.summary     = %q{Library to read openzim (wikipedia) files}
  s.description = %q{This library can be used to read openzim files like the ones exported by wikipedia}
  # s.has_rdoc is no longer set: the setter is deprecated in RubyGems and
  # only triggers a warning.

  #s.add_dependency('ruby-xz', '>= 0.0.2')
  # added xz.rb directly to make it ruby 1.8 compatible

  # package every file tracked by git
  s.files         = `git ls-files`.split("\n")
  s.require_paths = ['lib']
end
|
metadata
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: zim-ruby
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 21
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 1
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
version: 1.0.1
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Christoph Plank
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-11-18 00:00:00 Z
|
19
|
+
dependencies: []
|
20
|
+
|
21
|
+
description: This library can be used to read openzim files like the ones exported by wikipedia
|
22
|
+
email:
|
23
|
+
- chrisistuff@gmail.com
|
24
|
+
executables: []
|
25
|
+
|
26
|
+
extensions: []
|
27
|
+
|
28
|
+
extra_rdoc_files: []
|
29
|
+
|
30
|
+
files:
|
31
|
+
- .gitignore
|
32
|
+
- LICENSE
|
33
|
+
- README.markdown
|
34
|
+
- examples/unpack.rb
|
35
|
+
- lib/xz.rb
|
36
|
+
- lib/zim.rb
|
37
|
+
- lib/zim/cluster.rb
|
38
|
+
- lib/zim/exceptions.rb
|
39
|
+
- lib/zim/file.rb
|
40
|
+
- lib/zim/structs.rb
|
41
|
+
- lib/zim/url.rb
|
42
|
+
- zim-ruby.gemspec
|
43
|
+
homepage: http://rubygems.org/gems/zim-ruby
|
44
|
+
licenses: []
|
45
|
+
|
46
|
+
post_install_message:
|
47
|
+
rdoc_options: []
|
48
|
+
|
49
|
+
require_paths:
|
50
|
+
- lib
|
51
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
52
|
+
none: false
|
53
|
+
requirements:
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
hash: 3
|
57
|
+
segments:
|
58
|
+
- 0
|
59
|
+
version: "0"
|
60
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
hash: 3
|
66
|
+
segments:
|
67
|
+
- 0
|
68
|
+
version: "0"
|
69
|
+
requirements: []
|
70
|
+
|
71
|
+
rubyforge_project:
|
72
|
+
rubygems_version: 1.8.11
|
73
|
+
signing_key:
|
74
|
+
specification_version: 3
|
75
|
+
summary: Library to read openzim (wikipedia) files
|
76
|
+
test_files: []
|
77
|
+
|