web-page-archiver 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/Gemfile ADDED
@@ -0,0 +1,16 @@
1
+ source "http://rubygems.org"
2
+ # Add dependencies required to use your gem here.
3
+ # Example:
4
+ # gem "activesupport", ">= 2.3.5"
5
+ gem "mime-types", ">= 0"
6
+ gem "nokogiri", ">= 0"
7
+
8
+
9
+ # Add dependencies to develop your gem here.
10
+ # Include everything needed to run rake, tests, features, etc.
11
+ group :development do
12
+ gem "shoulda", ">= 0"
13
+ gem "bundler", "~> 1.0.0"
14
+ gem "jeweler", "~> 1.6.0"
15
+ gem "rcov", ">= 0"
16
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,24 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ git (1.2.5)
5
+ jeweler (1.6.0)
6
+ bundler (~> 1.0.0)
7
+ git (>= 1.2.5)
8
+ rake
9
+ mime-types (1.19)
10
+ nokogiri (1.4.4)
11
+ rake (0.8.7)
12
+ rcov (0.9.9)
13
+ shoulda (2.11.3)
14
+
15
+ PLATFORMS
16
+ ruby
17
+
18
+ DEPENDENCIES
19
+ bundler (~> 1.0.0)
20
+ jeweler (~> 1.6.0)
21
+ mime-types
22
+ nokogiri
23
+ rcov
24
+ shoulda
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ Copyright (c) 2012 murb
2
+ Portions copyright (c) 2011 takuya
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining
5
+ a copy of this software and associated documentation files (the
6
+ "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the Software, and to
9
+ permit persons to whom the Software is furnished to do so, subject to
10
+ the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,21 @@
1
+ = Web page archiver
2
+
3
+ Web page archiver is a gem for building web archives. It supports MHT (this gem is based on takuya's mht gem) and HTML with data URIs (the gem was renamed from the original mht to emphasize its support for this alternative format).
4
+ MHT, also known as MHTML, is the web page archive format used by Internet Explorer.
5
+ This package can build web archives from local files and from URIs.
6
+
7
+ == Contributing to web page archiver
8
+
9
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
10
+ * Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
11
+ * Fork the project
12
+ * Start a feature/bugfix branch
13
+ * Commit and push until you are happy with your contribution
14
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
15
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate it to its own commit so I can cherry-pick around it.
16
+
17
+ == Copyright
18
+
19
+ Copyright (c) 2012 murb. See LICENSE.txt for further details.
20
+ Portions copyright (c) 2011 takuya. See LICENSE.txt for further details.
21
+
data/Rakefile ADDED
@@ -0,0 +1,54 @@
1
# encoding: utf-8
# Build, test, coverage and documentation tasks for the web-page-archiver gem.

require 'rubygems'
require 'bundler'
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  # Abort early with Bundler's own exit status when gems are missing.
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "web-page-archiver"
  gem.homepage = "http://github.com/murb/web-page-archiver"
  gem.license = "MIT"
  gem.summary = %Q{web page archiver creates self-containing, one file, html or mhtml files}
  # Same text as the summary (the previous description ended in the typo "filese").
  gem.description = %Q{web page archiver creates self-containing, one file, html or mhtml files}
  gem.email = "github.com+web-page-archiver@murb.nl"
  gem.authors = ["takuya","murb"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

require 'rcov/rcovtask'
Rcov::RcovTask.new do |test|
  test.libs << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
  test.rcov_opts << '--exclude "gems/*"'
end

task :default => :test

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # Strip the trailing newline of the VERSION file so it does not end up in the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""
  rdoc.options << '-c UTF8'

  rdoc.rdoc_dir = 'rdoc'
  # Title uses this gem's name (it previously still said "mht", the gem this one is based on).
  rdoc.title = "web-page-archiver #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.5
Binary file
@@ -0,0 +1,13 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3
+ <html xmlns="http://www.w3.org/1999/xhtml">
4
+ <head>
5
+ <title>Test</title>
6
+ <script type="text/javascript" src="test.js"></script>
7
+ <link rel="stylesheet" href="style.css" type="text/css" charset="utf-8">
8
+ </head>
9
+ <body onload="test();">
10
+ <h1>test</h1>
11
+ <img src="image.png" alt="image"/>
12
+ </body>
13
+ </html>
@@ -0,0 +1,11 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
2
+ <html xmlns="http://www.w3.org/1999/xhtml">
3
+ <head>
4
+ <title>Test</title>
5
+ <script type="text/javascript" src="data:application/javascript;base64,ZnVuY3Rpb24gdGVzdCgpIHsKCWFsZXJ0KCd0ZXN0Jyk7Cn0="></script><link rel="stylesheet" href="data:text/css;base64,aDEgewoJY29sb3I6IGdyZWVuOwp9" type="text/css" charset="utf-8">
6
+ </head>
7
+ <body onload="test();">
8
+ <h1>test</h1>
9
+ <img src="" alt="image">
10
+ </body>
11
+ </html>
@@ -0,0 +1,90 @@
1
+ Subject: Test
2
+ Content-Type: multipart/related; boundary=mimepart_bd61247b43556d6a368bf44e197457d5
3
+ Content-Location: fixtures/index.html
4
+ Date: Mon Jun 25 12:39:32 +0200 2012
5
+ MIME-Version: 1.0
6
+
7
+ mime mhtml content
8
+
9
+ --mimepart_bd61247b43556d6a368bf44e197457d5
10
+ Content-Disposition: inline; filename=default.htm
11
+ Content-Type: text/html
12
+ Content-Id: d1abb81a66869e855037522d0110e746
13
+ Content-Location: fixtures/index.html
14
+ Content-Transfer-Encoding: 8bit
15
+
16
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
17
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
18
+ <html xmlns="http://www.w3.org/1999/xhtml">
19
+ <head>
20
+ <title>Test</title>
21
+ <script type="text/javascript" src="test.js"></script>
22
+ <link rel="stylesheet" href="style.css" type="text/css" charset="utf-8">
23
+ </head>
24
+ <body onload="test();">
25
+ <h1>test</h1>
26
+ <img src="image.png" alt="image"/>
27
+ </body>
28
+ </html>
29
+
30
+ --mimepart_bd61247b43556d6a368bf44e197457d5
31
+ Content-Disposition: inline; filename=style.css
32
+ Content-Type: text/css
33
+ Content-Location: fixtures/style.css
34
+ Content-Transfer-Encoding: Base64
35
+ Content-Id: 49c35183dac4f31d192c35c09f2f9543
36
+
37
+ aDEgewoJY29sb3I6IGdyZWVuOwp9
38
+
39
+ --mimepart_bd61247b43556d6a368bf44e197457d5
40
+ Content-Disposition: inline; filename=test.js
41
+ Content-Type: application/javascript
42
+ Content-Location: fixtures/test.js
43
+ Content-Transfer-Encoding: Base64
44
+ Content-Id: f4f918b9efce07f37853e96d416a224d
45
+
46
+ ZnVuY3Rpb24gdGVzdCgpIHsKCWFsZXJ0KCd0ZXN0Jyk7Cn0=
47
+
48
+ --mimepart_bd61247b43556d6a368bf44e197457d5
49
+ Content-Disposition: inline; filename=image.png
50
+ Content-Type: image/png
51
+ Content-Location: fixtures/image.png
52
+ Content-Transfer-Encoding: Base64
53
+ Content-Id: 3f31c1d1dc59c8f85210bfd90830708b
54
+
55
+ iVBORw0KGgoAAAANSUhEUgAAAIAAAACACAYAAADDPmHLAAAACXBIWXMAABYl
56
+ AAAWJQFJUiTwAAAAGXRFWHRTb2Z0d2FyZQBBZG9iZSBJbWFnZVJlYWR5ccll
57
+ PAAABW9JREFUeNrsXVtyEzkUvXYIgeERYJ7wk7CCsAOzAy/BLIEdsIQsoWcH
58
+ XkLYQdhBh5/hjYHJACEkQarcTFzGdvxQS/dxTtWpcv+0bZ3TV1fSbYkIAAAA
59
+ AAAAAAAAcISW4/++EbjJn/cCn8MAtrAe+DDwAfMWX8+Cp4EfAp8x4/VHxAv5
60
+ 6AZuB9aBJ4lZ8727aGZZ6ARWgYMGRJ/EAX9nB81fDr2GnvRFIkMPcvgTHkYo
61
+ EOolCj/OCOgaEmfzfQXCj7LPvx1YMqsfKBR/OFnEqGFBVIqFH2UFOecL+buG
62
+ xD/jLrqEi7GlJNFbJkHcgsyTxR8YFn84L4AJnIoPE0B8mGA04asdij+cE7hO
63
+ DHcdiz88OsA43zndzRN0IfpPLDJjWKIiKPZ5e3RaoQOcI1YgbZKDyqM+nvap
64
+ C0im0YHIFzLrUnLuLqCm80pcYDxi93jf4h/r4ememT2LEQBPv+MogKdfcBTI
65
+ 9fRDVKczhMj8BY8I2hkM8MhBn/2YTl87+xtt9zOsL/WO9tUp1zgG2sXvks9E
66
+ LaUJVFcVbzvO0lOZYBvZv94hWgoT1FrFX8f4PJkJVFYNdSF+MhOozAOeQPz/
67
+ sbHkdz/RaICm1v3j0Ggnc36x7LTsslPhKusEdhoSf2sox9hVIH6KsvcdjQbI
68
+ UUfftAkkiH9G1waY9hJFUyaQJL5rA8zyBk1qE0gTX50BNjKKn9oEEsU/4TZV
69
+ g05m8VOZQKr42YtFJRhg0cmPRU0gWXx1BtgoHPLmNYF08VVOB5d+SWJWE2gQ
70
+ 3+0ooGrYBFrEdz0MbMoEmsRXaYAdwSbQJr7KqeDUi0GpTKBNfLWLQU0sB5fe
71
+ SKHUfkYql4ObKgipnImvtiCkyZKwypH4akvCImoDJigtvtqi0Iimy8Ir4+Kr
72
+ LwvPURhaGRZf/YshlKkRK6Piq381jCjffoCVMfHN7B/YUdRg0vYtNnP2UK3A
73
+ BNLEN7WFbE946JS4Y7m5I+hqoSaQKH5NBlFio6hKofgmn/5SUWCaCaSKX5Nh
74
+ lNowqlIivqnMfxJKbRbd58btCRY/+7o/touXAzfbxedaI8CBEZgixpEx0oFD
75
+ oxwfGnWWD+DYOOfAwZEAjo4FcHg0QDg+HqB8u3+VyPbXIa/PeYIKci4+Y6g5
76
+ Lxh4nOFrokvQeNpoHyE/LTpKEsSaHCzplkRPqBFqMlzJAyNMz+4hfOGuocqc
77
+ LA74OxHqBY4athuKDDXf21RW3zJshpiBPwx8wLzF17PgKZ1W6DxjxmuTlTot
78
+ x9EibkS5yZ/3Ap8jgAIA4Aseu4DLgWv8+SDwGwxgCyuBNwJ/CbwaeImvZ8G/
79
+ gd8DvwR+5usjGEBHxn+dM/21xPc+4BHBvsWRgGYDXAu8w6KvZPrOIzbD+8D/
80
+ YIAyuB34J/flJRFzh1ekfB8fTQaIT/ofAoQfZ4TXHBlggAYQk7l7AoUfZ4R/
81
+ OHmEARKgHXh3jgxeCuLI4UXgMQywXIL3F5tAI6L4LzUkihINEPv5m0ZGWZ84
82
+ P4AB5gj5a2QLB5K7BCkGWOUnf5Vs4pAjwSEMMFn8NtnGsUQTtASI/7sD8YdN
83
+ 8EaSCUoaIC7S/Eb+ViRjedlbOl10cmuA+L2/Ur45fGmIawrvSMB5gKUMcJsj
84
+ gGfECDDwaIDrBod6ywwR9z0ZYJUNAJxjv2RSmNsAN8l3JfKkpPCTBwNcLdTv
85
+ x772Gw8114Qa8KwMzawB2myAEg17MPI7rgg1wRcqMF2cqyFKNPoRja/4bQmN
86
+ BLEr+GrRAG3KP8d/RNMnWuL/llhgcpg7CuQwwKXMT9sxzVbKHX/TirBIcEKZ
87
+ ZwhbGe7fztyA8z5B0mYjjynjDGFL+f3HGUDD72zqfwAAAAAAAAAAAFyAHwIM
88
+ AIXWwzwh4hMJAAAAAElFTkSuQmCC
89
+
90
+ --mimepart_bd61247b43556d6a368bf44e197457d5--
@@ -0,0 +1,3 @@
1
+ h1 {
2
+ color: green;
3
+ }
data/fixtures/test.js ADDED
@@ -0,0 +1,3 @@
1
+ function test() {
2
+ alert('test');
3
+ }
@@ -0,0 +1,291 @@
1
+ # encoding: utf-8
2
+ # generate mhtml file
3
+ # == uri target uri
4
+ # return mhtml file
5
+ #mhtml = WebPageArchiver::MhtmlGenerator.generate("https://rubygems.org/")
6
+ #open("output.mht", "w+"){|f| f.write mhtml }
7
+ module WebPageArchiver
8
+ require 'rubygems'
9
+ require 'nokogiri'
10
+ require 'open-uri'
11
+ require 'digest/md5'
12
+ require 'stringio'
13
+ require 'base64'
14
+ require 'thread'
15
+ require 'mime/types'
16
+
17
# Shared state and helpers mixed into the archive generators: reference
# resolution, content-type lookup and the threaded asset downloader.
module GeneratorHelpers
  # Base locations that are fetched over the network rather than read from disk.
  REMOTE_BASE = %r{\A(?:https?|ftp)://}i
  # References that already carry their own URI scheme (http:, data:, ...).
  SCHEME_PREFIX = %r{\A[a-z][a-z0-9+.\-]*:}i

  # Sets up the per-generator state:
  # @contents:: asset id => {:uri, :body, :content_type, ...}
  # @src::      buffer the generators write their output to
  # @boundary:: MIME boundary string derived from the current time
  # @threads::  download worker threads started so far
  # @queue::    asset ids still waiting to be downloaded
  # @conf::     options; :base64_except lists content kinds kept unencoded
  def initialize
    @contents = {}
    @src = StringIO.new
    @boundary = "mimepart_#{Digest::MD5.hexdigest(Time.now.to_s)}"
    @threads = []
    @queue = Queue.new
    @conf = { :base64_except => ["html"] }
  end

  # Resolves +path+ (a reference found in the page) against the location of
  # the page itself and returns the result as a String.
  #
  # base_filename_or_uri:: local filename or http/https/ftp URI of the page
  # path::                 reference taken from a src/href attribute
  #
  # Remote bases are resolved with URI.join. For local bases the reference is
  # resolved relative to the page's directory; references that already have a
  # scheme or are absolute paths are returned unchanged. The base is never
  # opened, so no stream is leaked and remote pages are not re-downloaded just
  # to resolve a reference.
  #
  # Raises ArgumentError when +path+ is nil.
  def join_uri(base_filename_or_uri, path)
    raise ArgumentError, "path must not be nil" if path.nil?
    base = base_filename_or_uri.to_s
    ref = path.to_s
    return URI.join(base, ref).to_s if base =~ REMOTE_BASE
    return ref if ref =~ SCHEME_PREFIX || ref.start_with?("/")
    dir = File.dirname(base)
    # A bare filename has "." as its directory; keep the reference as written.
    dir == "." ? ref : File.join(dir, ref)
  end

  # Returns the content type of an opened resource: looked up from the file
  # extension for File objects, otherwise read from the stream's
  # "content-type" metadata.
  def content_type(f)
    return MIME::Types.type_for(f.path).first if f.is_a? File
    f.meta["content-type"]
  end

  # Starts +num+ worker threads that download every queued asset and returns
  # the list of all threads started so far (callers join them).
  def start_download_thread(num=5)
    num.times { @threads.push(Thread.start { drain_queue }) }
    @threads
  end

  # True when every registered asset has a body.
  def download_finished?
    @contents.values.none? { |entry| entry[:body].nil? }
  end

  private

  # Worker loop: takes asset ids off the queue until it is empty and stores
  # body and content type for each one that has no body yet.
  def drain_queue
    loop do
      begin
        # Non-blocking pop: checking empty? and then doing a blocking pop lets
        # another worker take the last item in between, leaving this one
        # waiting forever.
        key = @queue.pop(true)
      rescue ThreadError
        break
      end
      entry = @contents[key]
      next unless entry[:body].nil?
      location = entry[:uri]
      # Block form closes the stream as soon as it has been read.
      open(location) do |stream|
        @contents[key] = entry.merge(:body => stream.read, :uri => location, :content_type => content_type(stream))
      end
    end
  end
end
63
+
64
# == generate mhtml (mht) file
#
# Builds a multipart/related document: the page itself is the first part,
# followed by one Base64-encoded part per referenced image, stylesheet and
# external script.
#
#   mhtml = WebPageArchiver::MhtmlGenerator.generate("https://rubygems.org/")
#   open("output.mht", "w+"){|f| f.write mhtml }
class MhtmlGenerator
  include GeneratorHelpers
  attr_accessor :conf

  # Convenience entry point: converts +uri+ with a fresh generator and
  # returns the MHTML text.
  def self.generate(uri)
    new.convert(uri)
  end

  # Reads the page at +filename_or_uri+, downloads the resources it
  # references and returns the complete MHTML document as a String.
  def convert(filename_or_uri)
    html = nil
    page_type = nil
    # Block form so the page stream is closed once it has been read.
    open(filename_or_uri) do |page|
      html = page.read
      page_type = content_type(page)
    end
    @parser = Nokogiri::HTML html
    write_envelope(filename_or_uri)
    register_assets(filename_or_uri, 'img', 'src')
    register_assets(filename_or_uri, 'link[rel=stylesheet]', 'href')
    register_assets(filename_or_uri, 'script', 'src')
    write_page_part(filename_or_uri, html, page_type)
    attach_contents
    @src.puts "--#{@boundary}--"
    @src.rewind
    @src.read
  end

  # Downloads every registered asset and appends one MIME part per asset.
  def attach_contents
    @contents.each_key { |cid| @queue.push cid }
    start_download_thread
    # Wait until all downloads have finished.
    @threads.each { |t| t.join }
    @contents.each_key { |cid| add_html_content(cid) }
  end

  # Appends the downloaded asset registered under +cid+ as a Base64 part.
  def add_html_content(cid)
    part = @contents[cid]
    filename = File.basename(URI(part[:uri]).path)
    @src.puts "--#{@boundary}"
    @src.puts "Content-Disposition: inline; filename=" + filename
    @src.puts "Content-Type: #{part[:content_type]}"
    @src.puts "Content-Location: #{part[:uri]}"
    @src.puts "Content-Transfer-Encoding: Base64"
    @src.puts "Content-Id: #{cid}"
    @src.puts ""
    @src.puts Base64.encode64(part[:body])
    @src.puts ""
    nil
  end

  private

  # Writes the top-level message headers and the preamble line.
  def write_envelope(filename_or_uri)
    @src.puts "Subject: " + @parser.search("title").text()
    @src.puts "Content-Type: multipart/related; boundary=#{@boundary}"
    @src.puts "Content-Location: #{filename_or_uri}"
    @src.puts "Date: #{Time.now.to_s}"
    @src.puts "MIME-Version: 1.0"
    @src.puts ""
    @src.puts "mime mhtml content"
    @src.puts ""
  end

  # Registers every element matching +selector+ whose +attribute+ points at
  # an external resource. Elements without the attribute (inline scripts,
  # images lacking src) and already-embedded data: references are skipped.
  def register_assets(base, selector, attribute)
    @parser.search(selector).each do |node|
      ref = node.attr(attribute)
      next if ref.nil? || ref.strip.empty? || ref =~ /\Adata:/i
      uri = join_uri(base, ref).to_s
      uid = Digest::MD5.hexdigest(uri)
      @contents[uid] = {:uri => uri}
      # Only the parsed tree is rewritten; convert emits the original page
      # text, so this does not change the generated output.
      node.set_attribute(attribute, "cid:#{uid}")
    end
  end

  # Writes the part holding the page itself: verbatim (8bit) when "html" is
  # listed in @conf[:base64_except], Base64-encoded otherwise.
  def write_page_part(filename_or_uri, html, page_type)
    # include? performs the membership test; Array#find("html") without a
    # block returns an Enumerator, which is always truthy.
    verbatim = @conf[:base64_except].include?("html")
    @src.puts "--#{@boundary}"
    @src.puts "Content-Disposition: inline; filename=default.htm"
    @src.puts "Content-Type: #{page_type}"
    @src.puts "Content-Id: #{Digest::MD5.hexdigest(filename_or_uri)}"
    @src.puts "Content-Location: #{filename_or_uri}"
    @src.puts "Content-Transfer-Encoding: #{verbatim ? '8bit' : 'Base64'}"
    @src.puts ""
    @src.puts(verbatim ? html : Base64.encode64(html))
    @src.puts ""
  end
end
152
+
153
+
154
# == generate self-containing data-uri based html file (html) file
#
# Replaces the src/href of every image, stylesheet link and external script
# with a Base64 data URI holding the downloaded resource.
#
#   mhtml = WebPageArchiver::DataUriHtmlGenerator.generate("https://rubygems.org/")
#   open("output.html", "w+"){|f| f.write mhtml }
class DataUriHtmlGenerator
  include GeneratorHelpers

  attr_accessor :conf

  # Convenience entry point: converts +uri+ with a fresh generator and
  # returns the resulting HTML text.
  def self.generate(uri)
    new.convert(uri)
  end

  # Parses the page at +filename_or_uri+, embeds the resources it references
  # and returns the serialized document.
  def convert(filename_or_uri)
    # Block form so the page stream is closed once it has been parsed.
    @parser = open(filename_or_uri) { |page| Nokogiri::HTML(page) }
    register_assets(filename_or_uri, 'img', 'src')
    register_assets(filename_or_uri, 'link[rel=stylesheet]', 'href')
    register_assets(filename_or_uri, 'script', 'src')
    set_contents
    @parser.to_s
  end

  # Downloads every registered asset and writes it as a data URI into each
  # attribute that referenced it.
  def set_contents
    @contents.each_key { |uid| @queue.push uid }
    start_download_thread
    # Wait until all downloads have finished.
    @threads.each { |t| t.join }
    @contents.each_value do |entry|
      encoded = Base64.encode64(entry[:body]).gsub(/\n/,'')
      data_uri = "data:#{entry[:content_type]};base64,#{encoded}"
      entry[:targets].each { |node, attribute| node.set_attribute(attribute, data_uri) }
    end
  end

  private

  # Registers every element matching +selector+ whose +attribute+ points at
  # an external resource. Elements without the attribute and already-embedded
  # data: references are skipped. All elements that reference the same
  # resource are collected under one entry so that each of them is rewritten,
  # not only the last one found.
  def register_assets(base, selector, attribute)
    @parser.search(selector).each do |node|
      ref = node.attr(attribute)
      next if ref.nil? || ref.strip.empty? || ref =~ /\Adata:/i
      uri = join_uri(base, ref).to_s
      uid = Digest::MD5.hexdigest(uri)
      entry = (@contents[uid] ||= {:uri => uri, :targets => []})
      entry[:targets] << [node, attribute]
      # Placeholder until the download has completed.
      node.set_attribute(attribute, "cid:#{uid}")
    end
  end
end
214
+
215
+
216
# == generate self-containing all-inline html file (html) file
#
# External scripts become inline <script> bodies, CSS stylesheets become
# <style> elements, and every other resource is embedded as a Base64 data URI.
#
#   mhtml = WebPageArchiver::InlineHtmlGenerator.generate("https://rubygems.org/")
#   open("output.html", "w+"){|f| f.write mhtml }
class InlineHtmlGenerator
  include GeneratorHelpers

  attr_accessor :conf

  # Convenience entry point: converts +uri+ with a fresh generator and
  # returns the resulting HTML text.
  def self.generate(uri)
    new.convert(uri)
  end

  # Parses the page at +filename_or_uri+, inlines the resources it references
  # and returns the serialized document.
  def convert(filename_or_uri)
    # Block form so the page stream is closed once it has been parsed.
    @parser = open(filename_or_uri) { |page| Nokogiri::HTML(page) }
    register_assets(filename_or_uri, 'img', 'src')
    register_assets(filename_or_uri, 'link[rel=stylesheet]', 'href')
    register_assets(filename_or_uri, 'script', 'src')
    set_contents
    @parser.to_s
  end

  # Downloads every registered asset and inlines it into each element that
  # referenced it.
  def set_contents
    @contents.each_key { |uid| @queue.push uid }
    start_download_thread
    # Wait until all downloads have finished.
    @threads.each { |t| t.join }
    @contents.each_value do |entry|
      entry[:targets].each { |node, attribute| inline_asset(node, attribute, entry) }
    end
  end

  private

  # Registers every element matching +selector+ whose +attribute+ points at
  # an external resource. Elements without the attribute and already-embedded
  # data: references are skipped. All elements that reference the same
  # resource are collected under one entry so that each of them is rewritten,
  # not only the last one found.
  def register_assets(base, selector, attribute)
    @parser.search(selector).each do |node|
      ref = node.attr(attribute)
      next if ref.nil? || ref.strip.empty? || ref =~ /\Adata:/i
      uri = join_uri(base, ref).to_s
      uid = Digest::MD5.hexdigest(uri)
      entry = (@contents[uid] ||= {:uri => uri, :targets => []})
      entry[:targets] << [node, attribute]
      # Placeholder until the download has completed.
      node.set_attribute(attribute, "cid:#{uid}")
    end
  end

  # Embeds one downloaded asset into +node+, choosing the form by element
  # kind and content type.
  def inline_asset(node, attribute, entry)
    if node.name == "script"
      node.content = entry[:body]
      node.remove_attribute(attribute)
    elsif node.name == "link" && stylesheet_type?(entry[:content_type])
      # A comparison is required here: the earlier `=` assigned "text/css",
      # which made this branch run for every <link> and overwrote the type.
      node.after("<style type=\"text/css\">#{entry[:body]}</style>")
      node.remove()
    else
      # back to inline for non-script and style files...
      encoded = Base64.encode64(entry[:body]).gsub(/\n/,'')
      node.set_attribute(attribute, "data:#{entry[:content_type]};base64,#{encoded}")
    end
  end

  # True when +type+ denotes CSS. The value is compared as text because it
  # may be a plain header string (possibly with parameters after the media
  # type) or an object from the MIME lookup. A missing type counts as CSS,
  # since only rel=stylesheet links are registered.
  def stylesheet_type?(type)
    text = type.to_s.strip
    text.empty? || !(text =~ %r{\Atext/css}i).nil?
  end
end
290
+
291
+ end
data/test/helper.rb ADDED
@@ -0,0 +1,18 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
10
+ require 'test/unit'
11
+ require 'shoulda'
12
+
13
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
14
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
15
+ require 'web_page_archiver'
16
+
17
+ class Test::Unit::TestCase
18
+ end
@@ -0,0 +1,30 @@
1
+ require 'helper'
2
+
3
# End-to-end checks for the three generators, run against the bundled
# fixtures and one remote copy of the same page.
class TestWebPageArchiver < Test::Unit::TestCase
  def test_generate_mht_remote
    archive = WebPageArchiver::MhtmlGenerator.generate("http://murb.github.com/web-page-archiver/static/")
    assert_mht_parts(archive)
  end

  def test_generate_mht_local
    archive = WebPageArchiver::MhtmlGenerator.generate("fixtures/index.html")
    assert_mht_parts(archive)
  end

  def test_generate_html_local
    page = WebPageArchiver::DataUriHtmlGenerator.generate("fixtures/index.html")
    [
      'touXAzfbxedaI8CBEZgixpEx0oFD',
      '<img src="',
      '<script type="text/javascript" src="data:application/javascript;base64,ZnVuY3Rpb24gdGVzdCgpIHsKCWFsZXJ0KCd0ZXN0Jyk7Cn0="></script><link rel="stylesheet" href="data:text/css;base64,aDEgewoJY29sb3I6IGdyZWVuOwp9" type="text/css" charset="utf-8">'
    ].each { |expected| assert(page.match(expected)) }
  end

  def test_generate_inline_html_local
    page = WebPageArchiver::InlineHtmlGenerator.generate("fixtures/index.html")
    ["alert", 'color: green;', '<img src="'].each { |expected| assert(page.match(expected)) }
  end

  private

  # Shared expectations for both MHT tests: a known slice of encoded data is
  # present, and the test.js part carries the expected headers and payload.
  def assert_mht_parts(archive)
    assert(archive.match('touXAzfbxedaI8CBEZgixpEx0oFD'))
    assert(archive.match(/Content-Disposition: inline; filename=test.js\nContent-Type: application\/(.*)javascript\nContent-Location: (.*)test.js\nContent-Transfer-Encoding: Base64\nContent-Id: (.*)\n\nZnVuY3Rpb24gdGVzdCgpIHsKCWFsZXJ0KCd0ZXN0Jyk7Cn0=/))
  end
end
@@ -0,0 +1,72 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{web-page-archiver}
8
+ s.version = "0.0.4"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["takuya", "murb"]
12
+ s.date = %q{2012-08-21}
13
+ s.description = %q{web page archiver creates self-containing, one file, html or mhtml filese}
14
+ s.email = %q{github.com+web-page-archiver@murb.nl}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE.txt",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ "Gemfile",
22
+ "Gemfile.lock",
23
+ "LICENSE.txt",
24
+ "README.rdoc",
25
+ "Rakefile",
26
+ "VERSION",
27
+ "fixtures/image.png",
28
+ "fixtures/index.html",
29
+ "fixtures/result.html",
30
+ "fixtures/result.mht",
31
+ "fixtures/style.css",
32
+ "fixtures/test.js",
33
+ "lib/web_page_archiver.rb",
34
+ "test/helper.rb",
35
+ "test/test_web_page_archiver.rb",
36
+ "web-page-archiver.gemspec"
37
+ ]
38
+ s.homepage = %q{http://github.com/murb/web-page-archiver}
39
+ s.licenses = ["MIT"]
40
+ s.require_paths = ["lib"]
41
+ s.rubygems_version = %q{1.3.6}
42
+ s.summary = %q{web page archiver creates self-containing, one file, html or mhtml files}
43
+
44
+ if s.respond_to? :specification_version then
45
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
46
+ s.specification_version = 3
47
+
48
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
49
+ s.add_runtime_dependency(%q<mime-types>, [">= 0"])
50
+ s.add_development_dependency(%q<shoulda>, [">= 0"])
51
+ s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
52
+ s.add_development_dependency(%q<jeweler>, ["~> 1.6.0"])
53
+ s.add_development_dependency(%q<rcov>, [">= 0"])
54
+ s.add_development_dependency(%q<nokogiri>, [">= 0"])
55
+ else
56
+ s.add_dependency(%q<mime-types>, [">= 0"])
57
+ s.add_dependency(%q<shoulda>, [">= 0"])
58
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
59
+ s.add_dependency(%q<jeweler>, ["~> 1.6.0"])
60
+ s.add_dependency(%q<rcov>, [">= 0"])
61
+ s.add_dependency(%q<nokogiri>, [">= 0"])
62
+ end
63
+ else
64
+ s.add_dependency(%q<mime-types>, [">= 0"])
65
+ s.add_dependency(%q<shoulda>, [">= 0"])
66
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
67
+ s.add_dependency(%q<jeweler>, ["~> 1.6.0"])
68
+ s.add_dependency(%q<rcov>, [">= 0"])
69
+ s.add_dependency(%q<nokogiri>, [">= 0"])
70
+ end
71
+ end
72
+
metadata ADDED
@@ -0,0 +1,155 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: web-page-archiver
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 4
9
+ version: 0.0.4
10
+ platform: ruby
11
+ authors:
12
+ - takuya
13
+ - murb
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2012-08-21 00:00:00 +02:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: mime-types
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ segments:
29
+ - 0
30
+ version: "0"
31
+ type: :runtime
32
+ version_requirements: *id001
33
+ - !ruby/object:Gem::Dependency
34
+ name: shoulda
35
+ prerelease: false
36
+ requirement: &id002 !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ segments:
41
+ - 0
42
+ version: "0"
43
+ type: :development
44
+ version_requirements: *id002
45
+ - !ruby/object:Gem::Dependency
46
+ name: bundler
47
+ prerelease: false
48
+ requirement: &id003 !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ~>
51
+ - !ruby/object:Gem::Version
52
+ segments:
53
+ - 1
54
+ - 0
55
+ - 0
56
+ version: 1.0.0
57
+ type: :development
58
+ version_requirements: *id003
59
+ - !ruby/object:Gem::Dependency
60
+ name: jeweler
61
+ prerelease: false
62
+ requirement: &id004 !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - ~>
65
+ - !ruby/object:Gem::Version
66
+ segments:
67
+ - 1
68
+ - 6
69
+ - 0
70
+ version: 1.6.0
71
+ type: :development
72
+ version_requirements: *id004
73
+ - !ruby/object:Gem::Dependency
74
+ name: rcov
75
+ prerelease: false
76
+ requirement: &id005 !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ segments:
81
+ - 0
82
+ version: "0"
83
+ type: :development
84
+ version_requirements: *id005
85
+ - !ruby/object:Gem::Dependency
86
+ name: nokogiri
87
+ prerelease: false
88
+ requirement: &id006 !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ segments:
93
+ - 0
94
+ version: "0"
95
+ type: :development
96
+ version_requirements: *id006
97
+ description: web page archiver creates self-containing, one file, html or mhtml filese
98
+ email: github.com+web-page-archiver@murb.nl
99
+ executables: []
100
+
101
+ extensions: []
102
+
103
+ extra_rdoc_files:
104
+ - LICENSE.txt
105
+ - README.rdoc
106
+ files:
107
+ - .document
108
+ - Gemfile
109
+ - Gemfile.lock
110
+ - LICENSE.txt
111
+ - README.rdoc
112
+ - Rakefile
113
+ - VERSION
114
+ - fixtures/image.png
115
+ - fixtures/index.html
116
+ - fixtures/result.html
117
+ - fixtures/result.mht
118
+ - fixtures/style.css
119
+ - fixtures/test.js
120
+ - lib/web_page_archiver.rb
121
+ - test/helper.rb
122
+ - test/test_web_page_archiver.rb
123
+ - web-page-archiver.gemspec
124
+ has_rdoc: true
125
+ homepage: http://github.com/murb/web-page-archiver
126
+ licenses:
127
+ - MIT
128
+ post_install_message:
129
+ rdoc_options: []
130
+
131
+ require_paths:
132
+ - lib
133
+ required_ruby_version: !ruby/object:Gem::Requirement
134
+ requirements:
135
+ - - ">="
136
+ - !ruby/object:Gem::Version
137
+ segments:
138
+ - 0
139
+ version: "0"
140
+ required_rubygems_version: !ruby/object:Gem::Requirement
141
+ requirements:
142
+ - - ">="
143
+ - !ruby/object:Gem::Version
144
+ segments:
145
+ - 0
146
+ version: "0"
147
+ requirements: []
148
+
149
+ rubyforge_project:
150
+ rubygems_version: 1.3.6
151
+ signing_key:
152
+ specification_version: 3
153
+ summary: web page archiver creates self-containing, one file, html or mhtml files
154
+ test_files: []
155
+