zim-ruby 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,18 @@
1
+ require 'rubygems'
2
+ require 'stringio'
3
+ require 'xz'
4
+
5
+ require 'zim/file'
6
+ require 'zim/structs'
7
+ require 'zim/cluster'
8
+ require 'zim/exceptions'
9
+ require 'zim/url'
10
+
11
+ # Zim module
12
+ #
13
+ # Example:
14
+ # zim = Zim::ZimFile.new('test.zim')
15
+ # p zim.urls.first.blob
16
+ module Zim
17
+ end
18
+
@@ -0,0 +1,61 @@
1
module Zim

  # Entry in the cluster list.
  #
  # A cluster starts with one compression byte followed by a table of 32-bit
  # offsets and the blob data. The offsets are relative to the position right
  # after the compression byte.
  class ClusterEntry

    # compression of the cluster
    attr_reader :compression

    # Read a cluster header from +f+ at its current position.
    #
    # +f+ must provide +read_int8+, +read_int32+, +read_str+, +seek+, +tell+
    # and (for compressed clusters) +file+. Only the compression byte is
    # consumed here; the offset table is loaded on the first blob access.
    def initialize(f)
      @compression = f.read_int8

      @f = f
      @pos = f.tell
      @offsets = nil
    end

    # Read the blob number +blob+ from the cluster and return its bytes.
    #
    # Compression values 0 and 1 are read as plain data, 4 is decompressed
    # with XZ. Raises UnknownCompression for any other value and
    # InvalidBlobNumber if +blob+ is out of range.
    def read_blob(blob)
      case @compression
      when 0, 1
        read_blob_uncompressed(nil, blob)
      when 4
        read_blob_lzma(blob)
      else
        raise UnknownCompression
      end
    end

    # Read blob +blob+ from plain cluster data.
    #
    # +io+ is the stream to read from (nil means the underlying file). When
    # +seek+ is true the offsets are translated to absolute file positions,
    # otherwise they are used as positions inside +io+.
    def read_blob_uncompressed(io, blob, seek = true) # :nodoc:
      # The table is cached only once it has been read completely, so a
      # failed read cannot leave a truncated table behind for later calls.
      @offsets ||= read_offsets(io, seek)

      raise InvalidBlobNumber if (blob < 0) || (blob >= (@offsets.count - 1))

      off = @offsets[blob]
      len = @offsets[blob + 1] - off
      off += @pos if seek

      @f.read_str(len, off, io)
    end

    # Read blob +blob+ from an XZ compressed cluster.
    #
    # The whole cluster is decompressed on every call; only the offset table
    # is cached between calls.
    def read_blob_lzma(blob) # :nodoc:
      @f.seek(@pos)
      io = StringIO.new(XZ.decompress_stream(@f.file))
      read_blob_uncompressed(io, blob, false)
    end

    private

    # Load the complete offset table and return it as an array.
    def read_offsets(io, seek)
      @f.seek(@pos, io) if seek

      # Every offset is 4 bytes wide, so the first offset divided by four is
      # the number of table entries (one more than the number of blobs).
      first = @f.read_int32(nil, io)
      remaining = (first >> 2) - 1

      table = [first]
      remaining.times { table << @f.read_int32(nil, io) }
      table
    end
  end
end
@@ -0,0 +1,15 @@
1
module Zim

  # All errors derive from StandardError so that a plain +rescue+ (or
  # +rescue => e+) catches them; only an explicit +rescue Exception+ would
  # catch direct subclasses of Exception.

  # file contains an invalid magic
  class InvalidZimMagic < StandardError
  end

  # the cluster compression is not supported
  class UnknownCompression < StandardError
  end

  # the blob number is out of range
  class InvalidBlobNumber < StandardError
  end
end
15
+
@@ -0,0 +1,122 @@
1
module Zim

  # ZIM magic number: the bytes "ZIM\x04" read as a little-endian 32-bit
  # integer
  ZIM_MAGIC = 72173914

  # Main class of the zim-ruby library
  #
  # Besides holding the parsed directory structures it is the low-level
  # reader used by all other classes: every +read_*+ method decodes
  # little-endian data from the underlying file, or from the IO passed as
  # +io+.
  class ZimFile
    # header informations (used internally)
    attr_reader :header

    # list of included mime types (used internally)
    attr_reader :mime_types

    # Directory of urls (used internally)
    attr_reader :urls

    # mapping of titles to urls (used internally)
    attr_reader :titles

    # cluster list (used internally)
    attr_reader :clusters

    # IO associated with this ZimFile
    attr_reader :file

    # Load data from the given file.
    #
    # Raises InvalidZimMagic if the file does not start with ZIM_MAGIC. The
    # file handle is closed again if loading fails for any reason.
    def initialize(filename)
      # Binary mode: text mode would translate line endings on Windows and
      # corrupt the data.
      @file = File.new(filename, 'rb')

      loaded = false
      begin
        @header = FileHeader.new(self)
        raise InvalidZimMagic if @header.magic != ZIM_MAGIC

        seek(@header.mime_list_pos)
        @mime_types = StringList.new(self)

        seek(@header.url_pos)
        @urls = Directory.new(self, @header.article_count, Url)

        seek(@header.title_pos)
        @titles = TitleList.new(self, @header.article_count)

        seek(@header.cluster_pos)
        @clusters = Directory.new(self, @header.cluster_count, ClusterEntry)

        loaded = true
      ensure
        @file.close unless loaded
      end
    end

    # Close the underlying file. Blobs can no longer be read afterwards.
    def close
      @file.close unless @file.closed?
    end

    # access a url by full url (including namespace)
    # e.g.: zim['/A/Table of Contents']
    #
    # Returns nil if no entry matches. This is a linear scan over all urls.
    def [](idx)
      @urls.find { |entry| entry.to_s == idx }
    end

    def seek(pos, io = nil) # :nodoc:
      io ||= @file
      io.seek(pos)
    end

    def tell(io = nil) # :nodoc:
      io ||= @file
      io.tell
    end

    # Read an unsigned 8-bit integer (after seeking to +pos+ if given).
    # Raises EOFError if the stream ends early.
    def read_int8(pos = nil, io = nil) # :nodoc:
      read_bytes(1, pos, io).unpack('C').first
    end

    # Read an unsigned little-endian 16-bit integer.
    # Raises EOFError if the stream ends early.
    def read_int16(pos = nil, io = nil) # :nodoc:
      read_bytes(2, pos, io).unpack('v').first
    end

    # Read an unsigned little-endian 32-bit integer.
    # Raises EOFError if the stream ends early.
    def read_int32(pos = nil, io = nil) # :nodoc:
      read_bytes(4, pos, io).unpack('V').first
    end

    # Read an unsigned little-endian 64-bit integer.
    # Raises EOFError if the stream ends early.
    def read_int64(pos = nil, io = nil) # :nodoc:
      # Decoded as two 32-bit halves; the 'V' directive is available on
      # every Ruby version, unlike the 64-bit 'Q<' directive.
      low, high = read_bytes(8, pos, io).unpack('VV')
      (high << 32) | low
    end

    # Read +size+ raw bytes (after seeking to +pos+ if given).
    def read_str(size, pos = nil, io = nil) # :nodoc:
      io ||= @file
      seek(pos, io) unless pos.nil?

      return '' if size == 0

      io.read(size)
    end

    # Read a zero terminated string (after seeking to +pos+ if given).
    #
    # The terminator is consumed but not returned. At end of stream an empty
    # string is returned; an unterminated trailing string is returned as is.
    def read_cstr(pos = nil, io = nil) # :nodoc:
      io ||= @file
      seek(pos, io) unless pos.nil?

      raw = io.gets("\0")
      return '' if raw.nil?

      str = raw.chomp("\0")
      # Strings in ZIM files are UTF-8; tag them accordingly so they compare
      # equal to ordinary string literals (no-op on Ruby 1.8).
      str.force_encoding('UTF-8') if str.respond_to?(:force_encoding)
      str
    end

    private

    # Read exactly +size+ bytes, raising EOFError on a short read.
    def read_bytes(size, pos, io)
      io ||= @file
      seek(pos, io) unless pos.nil?

      data = io.read(size)
      if data.nil? || data.bytesize < size
        raise EOFError, "unexpected end of data (wanted #{size} bytes)"
      end

      data
    end
  end
end
122
+
@@ -0,0 +1,84 @@
1
module Zim

  # Header data of a zim file. All fields are read sequentially from the
  # reader's current position.
  class FileHeader

    # magic number
    attr_reader :magic

    # zim version
    attr_reader :version

    # uuid
    attr_reader :uuid
    attr_reader :article_count
    attr_reader :cluster_count
    attr_reader :url_pos # :nodoc:
    attr_reader :title_pos # :nodoc:
    attr_reader :cluster_pos # :nodoc:
    attr_reader :mime_list_pos # :nodoc:

    # main page index
    attr_reader :main_page

    # layout page index
    attr_reader :layout_page
    attr_reader :checksum_pos # :nodoc:

    # Read the file header from +f+. The right-hand sides below are
    # evaluated left to right, which is the order of the fields in the file.
    def initialize(f)
      @magic, @version = f.read_int32, f.read_int32
      @uuid = f.read_str(16)
      @article_count, @cluster_count = f.read_int32, f.read_int32
      @url_pos, @title_pos = f.read_int64, f.read_int64
      @cluster_pos, @mime_list_pos = f.read_int64, f.read_int64
      @main_page, @layout_page = f.read_int32, f.read_int32
      @checksum_pos = f.read_int64
    end
  end

  # List of zero terminated strings; reading stops at (and consumes) the
  # first empty string.
  class StringList < Array
    def initialize(f)
      super()

      while true
        entry = f.read_cstr
        break unless entry.length > 0

        self << entry
      end
    end
  end

  # Array of +clazz+ instances. First +count+ 64-bit positions are read from
  # +f+, then +f+ is moved to each position in turn and handed to
  # +clazz.new+.
  class Directory < Array
    def initialize(f, count, clazz)
      super()

      positions = (0...count).map { f.read_int64 }

      positions.each do |position|
        f.seek(position)
        push(clazz.new(f))
      end
    end
  end

  # Array of +count+ 32-bit integers read from +f+.
  class TitleList < Array
    def initialize(f, count)
      super()

      count.times { self << f.read_int32 }
    end
  end
end
@@ -0,0 +1,67 @@
1
module Zim

  # Url (directory) entry. An entry is either an article, which points at a
  # blob inside a cluster, or a redirect, which points at another entry of
  # the url list.
  class Url

    # mime type of the article, nil for redirects
    attr_reader :mime_type
    attr_reader :namespace
    attr_reader :revision

    # index of the target in the url list, nil for articles
    attr_reader :redirect_index

    # cluster holding the article data, nil for redirects
    attr_reader :cluster_number

    # blob inside the cluster, nil for redirects
    attr_reader :blob_number
    attr_reader :url
    attr_reader :title
    attr_reader :params

    # Read a url entry from +f+ at its current position.
    #
    # +f+ is kept so that +blob+ can reach the url list and the clusters
    # later on.
    def initialize(f)
      @f = f
      mime_index = f.read_int16

      # Both entry kinds share this prefix.
      param_len = f.read_int8
      @namespace = f.read_int8.chr
      @revision = f.read_int32

      if mime_index == 0xffff
        # redirect
        @mime_type = nil
        @redirect_index = f.read_int32
        @cluster_number = nil
        @blob_number = nil
      else
        @mime_type = f.mime_types[mime_index]
        @redirect_index = nil
        @cluster_number = f.read_int32
        @blob_number = f.read_int32
      end

      # ... and this suffix.
      @url = f.read_cstr
      @title = f.read_cstr
      @params = f.read_str(param_len)
    end

    # checks if this url is a redirect
    def redirect?
      @mime_type.nil?
    end

    # Receive the blob of this url.
    #
    # For a redirect the blob of the target entry is returned; chains of
    # redirects are followed. A redirect has no cluster or blob number of
    # its own, so the lookup has to be done on the target.
    # NOTE: a redirect cycle in a malformed file would recurse without bound.
    def blob
      return @f.urls[@redirect_index].blob if redirect?

      @f.clusters[@cluster_number].read_blob(@blob_number)
    end

    # Full url including the namespace, e.g. "/A/Table of Contents".
    def to_s
      "/#{@namespace}/#{@url}"
    end
  end
end
67
+
@@ -0,0 +1,19 @@
1
# -*- encoding: utf-8 -*-

# Packaging description of the zim-ruby gem.
Gem::Specification.new do |spec|
  # identity
  spec.name     = 'zim-ruby'
  spec.version  = '1.0.1'
  spec.platform = Gem::Platform::RUBY

  # authorship
  spec.authors  = ['Christoph Plank']
  spec.email    = ['chrisistuff@gmail.com']
  spec.homepage = 'http://rubygems.org/gems/zim-ruby'

  # description
  spec.summary     = 'Library to read openzim (wikipedia) files'
  spec.description = 'This library can be used to read openzim files like the ones exported by wikipedia'
  spec.has_rdoc    = false

  #spec.add_dependency('ruby-xz', '>= 0.0.2')
  # added xz.rb directly to make it ruby 1.8 compatible

  # contents: every file tracked by git, loaded from lib/
  tracked_files = `git ls-files`
  spec.files         = tracked_files.split("\n")
  spec.require_paths = ['lib']
end
metadata ADDED
@@ -0,0 +1,77 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: zim-ruby
3
+ version: !ruby/object:Gem::Version
4
+ hash: 21
5
+ prerelease:
6
+ segments:
7
+ - 1
8
+ - 0
9
+ - 1
10
+ version: 1.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Christoph Plank
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-11-18 00:00:00 Z
19
+ dependencies: []
20
+
21
+ description: This library can be used to read openzim files like the ones exported by wikipedia
22
+ email:
23
+ - chrisistuff@gmail.com
24
+ executables: []
25
+
26
+ extensions: []
27
+
28
+ extra_rdoc_files: []
29
+
30
+ files:
31
+ - .gitignore
32
+ - LICENSE
33
+ - README.markdown
34
+ - examples/unpack.rb
35
+ - lib/xz.rb
36
+ - lib/zim.rb
37
+ - lib/zim/cluster.rb
38
+ - lib/zim/exceptions.rb
39
+ - lib/zim/file.rb
40
+ - lib/zim/structs.rb
41
+ - lib/zim/url.rb
42
+ - zim-ruby.gemspec
43
+ homepage: http://rubygems.org/gems/zim-ruby
44
+ licenses: []
45
+
46
+ post_install_message:
47
+ rdoc_options: []
48
+
49
+ require_paths:
50
+ - lib
51
+ required_ruby_version: !ruby/object:Gem::Requirement
52
+ none: false
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ hash: 3
57
+ segments:
58
+ - 0
59
+ version: "0"
60
+ required_rubygems_version: !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ hash: 3
66
+ segments:
67
+ - 0
68
+ version: "0"
69
+ requirements: []
70
+
71
+ rubyforge_project:
72
+ rubygems_version: 1.8.11
73
+ signing_key:
74
+ specification_version: 3
75
+ summary: Library to read openzim (wikipedia) files
76
+ test_files: []
77
+