web-page-archiver 0.0.4 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +2 -3
- data/Gemfile.lock +28 -10
- data/{README.rdoc → README.md} +3 -3
- data/Rakefile +0 -19
- data/VERSION +1 -1
- data/lib/web_page_archiver.rb +206 -149
- data/test/test_web_page_archiver.rb +22 -0
- data/web-page-archiver.gemspec +20 -24
- metadata +95 -104
data/Gemfile
CHANGED
@@ -10,7 +10,6 @@ gem "nokogiri", ">= 0"
|
|
10
10
|
# Include everything needed to run rake, tests, features, etc.
|
11
11
|
group :development do
|
12
12
|
gem "shoulda", ">= 0"
|
13
|
-
gem "bundler", "~> 1.0.0"
|
14
|
-
gem "jeweler", "~> 1.6.0"
|
15
|
-
gem "rcov", ">= 0"
|
13
|
+
gem "bundler", "~> 1"
|
14
|
+
gem "jeweler", "~> 1"
|
16
15
|
end
|
data/Gemfile.lock
CHANGED
@@ -1,24 +1,42 @@
|
|
1
1
|
GEM
|
2
2
|
remote: http://rubygems.org/
|
3
3
|
specs:
|
4
|
+
activesupport (3.2.13)
|
5
|
+
i18n (= 0.6.1)
|
6
|
+
multi_json (~> 1.0)
|
7
|
+
bourne (1.4.0)
|
8
|
+
mocha (~> 0.13.2)
|
4
9
|
git (1.2.5)
|
5
|
-
|
6
|
-
|
10
|
+
i18n (0.6.1)
|
11
|
+
jeweler (1.8.4)
|
12
|
+
bundler (~> 1.0)
|
7
13
|
git (>= 1.2.5)
|
8
14
|
rake
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
15
|
+
rdoc
|
16
|
+
json (1.7.7)
|
17
|
+
metaclass (0.0.1)
|
18
|
+
mime-types (1.22)
|
19
|
+
mocha (0.13.3)
|
20
|
+
metaclass (~> 0.0.1)
|
21
|
+
multi_json (1.7.2)
|
22
|
+
nokogiri (1.5.9)
|
23
|
+
rake (10.0.4)
|
24
|
+
rdoc (4.0.1)
|
25
|
+
json (~> 1.4)
|
26
|
+
shoulda (3.4.0)
|
27
|
+
shoulda-context (~> 1.0, >= 1.0.1)
|
28
|
+
shoulda-matchers (~> 1.0, >= 1.4.1)
|
29
|
+
shoulda-context (1.1.0)
|
30
|
+
shoulda-matchers (1.5.6)
|
31
|
+
activesupport (>= 3.0.0)
|
32
|
+
bourne (~> 1.3)
|
14
33
|
|
15
34
|
PLATFORMS
|
16
35
|
ruby
|
17
36
|
|
18
37
|
DEPENDENCIES
|
19
|
-
bundler (~> 1
|
20
|
-
jeweler (~> 1
|
38
|
+
bundler (~> 1)
|
39
|
+
jeweler (~> 1)
|
21
40
|
mime-types
|
22
41
|
nokogiri
|
23
|
-
rcov
|
24
42
|
shoulda
|
data/{README.rdoc → README.md}
RENAMED
@@ -1,10 +1,10 @@
|
|
1
|
-
|
1
|
+
# Web page archiver
|
2
2
|
|
3
3
|
Web page archiver is a gem for building web archives. It supports mht (this gem is actually based on takuya's mht gem) and html with data urls (the rename from the original mht name is just to emphasize the support for this alternative format).
|
4
4
|
mht, also known as mhtml, is Internet Explorer's web page archive format.
|
5
5
|
this package can make web archives from local files and URI's
|
6
6
|
|
7
|
-
|
7
|
+
## Contributing to web page archiver
|
8
8
|
|
9
9
|
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
|
10
10
|
* Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it
|
@@ -14,7 +14,7 @@ this package can make web archives from local files and URI's
|
|
14
14
|
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
15
15
|
* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
|
16
16
|
|
17
|
-
|
17
|
+
## Copyright
|
18
18
|
|
19
19
|
Copyright (c) 2012 murb. See LICENSE.txt for further details.
|
20
20
|
Portions copyright (c) 2011 takuya. See LICENSE.txt for further details.
|
data/Rakefile
CHANGED
@@ -32,23 +32,4 @@ Rake::TestTask.new(:test) do |test|
|
|
32
32
|
test.verbose = true
|
33
33
|
end
|
34
34
|
|
35
|
-
require 'rcov/rcovtask'
|
36
|
-
Rcov::RcovTask.new do |test|
|
37
|
-
test.libs << 'test'
|
38
|
-
test.pattern = 'test/**/test_*.rb'
|
39
|
-
test.verbose = true
|
40
|
-
test.rcov_opts << '--exclude "gems/*"'
|
41
|
-
end
|
42
|
-
|
43
35
|
task :default => :test
|
44
|
-
|
45
|
-
require 'rake/rdoctask'
|
46
|
-
Rake::RDocTask.new do |rdoc|
|
47
|
-
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
48
|
-
rdoc.options << '-c UTF8'
|
49
|
-
|
50
|
-
rdoc.rdoc_dir = 'rdoc'
|
51
|
-
rdoc.title = "mht #{version}"
|
52
|
-
rdoc.rdoc_files.include('README*')
|
53
|
-
rdoc.rdoc_files.include('lib/**/*.rb')
|
54
|
-
end
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.4
|
1
|
+
0.0.6
|
data/lib/web_page_archiver.rb
CHANGED
@@ -1,19 +1,17 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
-
|
3
|
-
# == uri target uri
|
4
|
-
# return mhtml file
|
5
|
-
#mhtml = WebPageArchiver::MhtmlGenerator.generate("https://rubygems.org/")
|
6
|
-
#open("output.mht", "w+"){|f| f.write mhtml }
|
2
|
+
|
7
3
|
module WebPageArchiver
|
8
|
-
|
9
|
-
require 'nokogiri'
|
10
|
-
require 'open-uri'
|
11
|
-
require 'digest/md5'
|
12
|
-
require 'stringio'
|
13
|
-
require 'base64'
|
14
|
-
require 'thread'
|
15
|
-
require 'mime/types'
|
4
|
+
|
5
|
+
require 'nokogiri'
|
6
|
+
require 'open-uri'
|
7
|
+
require 'digest/md5'
|
8
|
+
require 'stringio'
|
9
|
+
require 'base64'
|
10
|
+
require 'thread'
|
11
|
+
require 'mime/types'
|
16
12
|
|
13
|
+
# Generic methods
|
14
|
+
# Shared by the MhtmlGenerator, the DataUriHtmlGenerator and the InlineHtmlGenerator
|
17
15
|
module GeneratorHelpers
|
18
16
|
def initialize
|
19
17
|
@contents = {}
|
@@ -21,26 +19,54 @@ require 'mime/types'
|
|
21
19
|
@boundary = "mimepart_#{Digest::MD5.hexdigest(Time.now.to_s)}"
|
22
20
|
@threads = []
|
23
21
|
@queue = Queue.new
|
24
|
-
@conf = { :base64_except=>["html"]}
|
22
|
+
@conf = { :base64_except=>["html"] }
|
25
23
|
end
|
24
|
+
|
25
|
+
# Creates an absolute URI string for a resource referenced from the base file or URI
|
26
|
+
#
|
27
|
+
# @param [String, URI] base_filename_or_uri from where the resource is linked
|
28
|
+
# @param [String] path of the resource (relative or absolute) within the parent resource
|
29
|
+
# @return [String] URI-string
|
26
30
|
def join_uri(base_filename_or_uri, path)
|
27
31
|
stream = open(base_filename_or_uri)
|
28
32
|
joined = ""
|
29
33
|
if stream.is_a? File
|
34
|
+
base_filename_or_uri = base_filename_or_uri.path if base_filename_or_uri.is_a? File
|
35
|
+
|
36
|
+
windows_drive_matcher = /((.*):\/)/
|
37
|
+
windows_drive_match_data = base_filename_or_uri.match windows_drive_matcher
|
38
|
+
if windows_drive_match_data
|
39
|
+
base_filename_or_uri.gsub!(windows_drive_matcher,'WINDOWS.DRIVE/')
|
40
|
+
end
|
41
|
+
|
30
42
|
joined = URI::join("file://#{base_filename_or_uri}", path)
|
31
43
|
joined = joined.to_s.gsub('file://','').gsub('file:','')
|
44
|
+
|
45
|
+
if windows_drive_match_data
|
46
|
+
joined = joined.gsub('WINDOWS.DRIVE/',windows_drive_match_data[1])
|
47
|
+
end
|
32
48
|
else
|
33
49
|
joined = URI::join(base_filename_or_uri, path)
|
34
50
|
end
|
35
51
|
return joined.to_s
|
36
52
|
end
|
37
|
-
|
38
|
-
|
39
|
-
|
53
|
+
|
54
|
+
# Determines the content type of a file or download
|
55
|
+
#
|
56
|
+
# @param [File,URI] object to test
|
57
|
+
# @return [String] mime-type / content type
|
58
|
+
def content_type(object)
|
59
|
+
if object.is_a? File
|
60
|
+
return MIME::Types.type_for(object.path).first
|
40
61
|
else
|
41
|
-
return
|
62
|
+
return object.meta["content-type"]
|
42
63
|
end
|
43
64
|
end
|
65
|
+
|
66
|
+
# Processes the download queue
|
67
|
+
#
|
68
|
+
# @param [Integer] num number of threads
|
69
|
+
# @return [Array<Thread>] the ruby-threads opened
|
44
70
|
def start_download_thread(num=5)
|
45
71
|
num.times{
|
46
72
|
t = Thread.start{
|
@@ -56,22 +82,34 @@ require 'mime/types'
|
|
56
82
|
}
|
57
83
|
return @threads
|
58
84
|
end
|
85
|
+
|
86
|
+
# Tests whether all the required content has been downloaded
|
59
87
|
def download_finished?
|
60
88
|
@contents.find{|k,v| v[:body] == nil } == nil
|
61
89
|
end
|
62
90
|
end
|
63
91
|
|
64
|
-
#
|
65
|
-
#
|
66
|
-
# mhtml = WebPageArchiver::MhtmlGenerator.generate("https://rubygems.org/")
|
67
|
-
# open("output.mht", "w+"){|f| f.write mhtml }
|
92
|
+
# generates mht-files
|
68
93
|
class MhtmlGenerator
|
69
94
|
include GeneratorHelpers
|
70
95
|
attr_accessor :conf
|
71
|
-
|
72
|
-
|
73
|
-
|
96
|
+
|
97
|
+
# generate mhtml (mht) file without instantiating a MhtmlGenerator object
|
98
|
+
#
|
99
|
+
# mhtml = WebPageArchiver::MhtmlGenerator.generate("https://rubygems.org/")
|
100
|
+
# open("output.mht", "w+"){|f| f.write mhtml }
|
101
|
+
#
|
102
|
+
# @param [String, URI] filename_or_uri to test for
|
103
|
+
# @return [String] text blob containing the result
|
104
|
+
def MhtmlGenerator.generate(filename_or_uri)
|
105
|
+
generator = MhtmlGenerator.new
|
106
|
+
return generator.convert(filename_or_uri)
|
74
107
|
end
|
108
|
+
|
109
|
+
# convert object at uri to self-contained text-file
|
110
|
+
#
|
111
|
+
# @param [String, URI] filename_or_uri to test for
|
112
|
+
# @return [String] text blob containing the result
|
75
113
|
def convert(filename_or_uri)
|
76
114
|
f = open(filename_or_uri)
|
77
115
|
html = f.read
|
@@ -126,6 +164,8 @@ require 'mime/types'
|
|
126
164
|
@src.rewind
|
127
165
|
return @src.read
|
128
166
|
end
|
167
|
+
|
168
|
+
# adds mime-parts
|
129
169
|
def attach_contents
|
130
170
|
# prepare queue
|
131
171
|
@contents.each{|k,v| @queue.push k}
|
@@ -135,6 +175,11 @@ require 'mime/types'
|
|
135
175
|
@threads.each{|t|t.join}
|
136
176
|
@contents.each{|k,v|self.add_html_content(k)}
|
137
177
|
end
|
178
|
+
|
179
|
+
# helper method to generate proper mime part headers
|
180
|
+
#
|
181
|
+
# @param [String] cid content ID
|
182
|
+
# @return [String] mime-part-text-blob
|
138
183
|
def add_html_content(cid)
|
139
184
|
filename = File.basename(URI(@contents[cid][:uri]).path)
|
140
185
|
@src.puts "--#{@boundary}"
|
@@ -150,142 +195,154 @@ require 'mime/types'
|
|
150
195
|
end
|
151
196
|
end
|
152
197
|
|
198
|
+
# self-containing data-uri based html
|
199
|
+
class DataUriHtmlGenerator
|
200
|
+
include GeneratorHelpers
|
201
|
+
|
202
|
+
attr_accessor :conf
|
203
|
+
|
204
|
+
# generate self-contained data-uri based html file without instantiating a DataUriHtmlGenerator object
|
205
|
+
#
|
206
|
+
# mhtml = WebPageArchiver::DataUriHtmlGenerator.generate("https://rubygems.org/")
|
207
|
+
# open("output.html", "w+"){|f| f.write mhtml }
|
208
|
+
#
|
209
|
+
# @param [String, URI] filename_or_uri to test for
|
210
|
+
# @return [String] text blob containing the result
|
211
|
+
def DataUriHtmlGenerator.generate(filename_or_uri)
|
212
|
+
generateror = DataUriHtmlGenerator.new
|
213
|
+
return generateror.convert(filename_or_uri)
|
214
|
+
end
|
153
215
|
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
def convert(filename_or_uri)
|
168
|
-
@parser = Nokogiri::HTML(open(filename_or_uri))
|
169
|
-
@parser.search('img').each{|i|
|
170
|
-
uri = i.attr('src');
|
171
|
-
uri = join_uri( filename_or_uri, uri).to_s
|
172
|
-
uid = Digest::MD5.hexdigest(uri)
|
173
|
-
@contents[uid] = {:uri=>uri, :parser_ref=>i, :attribute_name=>'src'}
|
174
|
-
i.set_attribute('src',"cid:#{uid}")
|
175
|
-
}
|
176
|
-
#styles
|
177
|
-
@parser.search('link[rel=stylesheet]').each{|i|
|
178
|
-
uri = i.attr('href');
|
179
|
-
uri = join_uri( filename_or_uri, uri)
|
180
|
-
uid = Digest::MD5.hexdigest(uri)
|
181
|
-
@contents[uid] = {:uri=>uri, :parser_ref=>i, :attribute_name=>'href'}
|
182
|
-
i.set_attribute('href',"cid:#{uid}")
|
183
|
-
}
|
184
|
-
#scripts
|
185
|
-
@parser.search('script').map{ |i|
|
186
|
-
next unless i.attr('src');
|
187
|
-
uri = i.attr('src');
|
188
|
-
uri = join_uri( filename_or_uri, uri)
|
189
|
-
uid = Digest::MD5.hexdigest(uri)
|
190
|
-
@contents[uid] = {:uri=>uri, :parser_ref=>i, :attribute_name=>'src'}
|
191
|
-
i.set_attribute('src',"cid:#{uid}")
|
216
|
+
# convert object at uri to self-contained text-file
|
217
|
+
#
|
218
|
+
# @param [String, URI] filename_or_uri to test for
|
219
|
+
# @return [String] text blob containing the result
|
220
|
+
def convert(filename_or_uri)
|
221
|
+
@parser = Nokogiri::HTML(open(filename_or_uri))
|
222
|
+
@parser.search('img').each{|i|
|
223
|
+
uri = i.attr('src');
|
224
|
+
uri = join_uri( filename_or_uri, uri).to_s
|
225
|
+
uid = Digest::MD5.hexdigest(uri)
|
226
|
+
@contents[uid] = {:uri=>uri, :parser_ref=>i, :attribute_name=>'src'}
|
227
|
+
i.set_attribute('src',"cid:#{uid}")
|
192
228
|
}
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
229
|
+
#styles
|
230
|
+
@parser.search('link[rel=stylesheet]').each{|i|
|
231
|
+
uri = i.attr('href');
|
232
|
+
uri = join_uri( filename_or_uri, uri)
|
233
|
+
uid = Digest::MD5.hexdigest(uri)
|
234
|
+
@contents[uid] = {:uri=>uri, :parser_ref=>i, :attribute_name=>'href'}
|
235
|
+
i.set_attribute('href',"cid:#{uid}")
|
236
|
+
}
|
237
|
+
#scripts
|
238
|
+
@parser.search('script').map{ |i|
|
239
|
+
next unless i.attr('src');
|
240
|
+
uri = i.attr('src');
|
241
|
+
uri = join_uri( filename_or_uri, uri)
|
242
|
+
uid = Digest::MD5.hexdigest(uri)
|
243
|
+
@contents[uid] = {:uri=>uri, :parser_ref=>i, :attribute_name=>'src'}
|
244
|
+
i.set_attribute('src',"cid:#{uid}")
|
245
|
+
}
|
246
|
+
self.set_contents
|
247
|
+
return @parser.to_s
|
213
248
|
end
|
214
249
|
|
250
|
+
# replaces content-placeholders with actual content
|
251
|
+
def set_contents
|
252
|
+
# prepare queue
|
253
|
+
@contents.each{|k,v| @queue.push k}
|
254
|
+
#start download threads
|
255
|
+
self.start_download_thread
|
256
|
+
# wait until download finished.
|
257
|
+
@threads.each{|t|t.join}
|
258
|
+
@contents.each do |k,v|
|
259
|
+
content_benc=Base64.encode64(v[:body]).gsub(/\n/,'')
|
260
|
+
tag=v[:parser_ref]
|
261
|
+
attribute=v[:attribute_name]
|
262
|
+
content_type=v[:content_type]
|
263
|
+
tag.set_attribute(attribute,"data:#{content_type};base64,#{content_benc}")
|
264
|
+
end
|
265
|
+
end
|
266
|
+
end
|
215
267
|
|
216
|
-
|
268
|
+
# self-containing all-inline based html
|
269
|
+
class InlineHtmlGenerator
|
270
|
+
include GeneratorHelpers
|
271
|
+
|
272
|
+
attr_accessor :conf
|
273
|
+
|
274
|
+
# generate self-contained all-inline html file without instantiating an InlineHtmlGenerator object
|
217
275
|
#
|
218
276
|
# mhtml = WebPageArchiver::InlineHtmlGenerator.generate("https://rubygems.org/")
|
219
277
|
# open("output.html", "w+"){|f| f.write mhtml }
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
end
|
278
|
+
#
|
279
|
+
# @param [String, URI] filename_or_uri to test for
|
280
|
+
# @return [String] text blob containing the result
|
281
|
+
def InlineHtmlGenerator.generate(filename_or_uri)
|
282
|
+
generator = InlineHtmlGenerator.new
|
283
|
+
return generator.convert(filename_or_uri)
|
284
|
+
end
|
228
285
|
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
uri = join_uri( filename_or_uri, uri)
|
242
|
-
uid = Digest::MD5.hexdigest(uri)
|
243
|
-
@contents[uid] = {:uri=>uri, :parser_ref=>i, :attribute_name=>'href'}
|
244
|
-
i.set_attribute('href',"cid:#{uid}")
|
245
|
-
}
|
246
|
-
#scripts
|
247
|
-
@parser.search('script').map{ |i|
|
248
|
-
next unless i.attr('src');
|
249
|
-
uri = i.attr('src');
|
250
|
-
uri = join_uri( filename_or_uri, uri)
|
251
|
-
uid = Digest::MD5.hexdigest(uri)
|
252
|
-
@contents[uid] = {:uri=>uri, :parser_ref=>i, :attribute_name=>'src'}
|
253
|
-
i.set_attribute('src',"cid:#{uid}")
|
286
|
+
# convert object at uri to self-contained text-file
|
287
|
+
#
|
288
|
+
# @param [String, URI] filename_or_uri to test for
|
289
|
+
# @return [String] text blob containing the result
|
290
|
+
def convert(filename_or_uri)
|
291
|
+
@parser = Nokogiri::HTML(open(filename_or_uri))
|
292
|
+
@parser.search('img').each{|i|
|
293
|
+
uri = i.attr('src');
|
294
|
+
uri = join_uri( filename_or_uri, uri).to_s
|
295
|
+
uid = Digest::MD5.hexdigest(uri)
|
296
|
+
@contents[uid] = {:uri=>uri, :parser_ref=>i, :attribute_name=>'src'}
|
297
|
+
i.set_attribute('src',"cid:#{uid}")
|
254
298
|
}
|
255
|
-
|
256
|
-
|
257
|
-
|
299
|
+
#styles
|
300
|
+
@parser.search('link[rel=stylesheet]').each{|i|
|
301
|
+
uri = i.attr('href');
|
302
|
+
uri = join_uri( filename_or_uri, uri)
|
303
|
+
uid = Digest::MD5.hexdigest(uri)
|
304
|
+
@contents[uid] = {:uri=>uri, :parser_ref=>i, :attribute_name=>'href'}
|
305
|
+
i.set_attribute('href',"cid:#{uid}")
|
306
|
+
}
|
307
|
+
#scripts
|
308
|
+
@parser.search('script').map{ |i|
|
309
|
+
next unless i.attr('src');
|
310
|
+
uri = i.attr('src');
|
311
|
+
uri = join_uri( filename_or_uri, uri)
|
312
|
+
uid = Digest::MD5.hexdigest(uri)
|
313
|
+
@contents[uid] = {:uri=>uri, :parser_ref=>i, :attribute_name=>'src'}
|
314
|
+
i.set_attribute('src',"cid:#{uid}")
|
315
|
+
}
|
316
|
+
self.set_contents
|
317
|
+
return @parser.to_s
|
318
|
+
end
|
258
319
|
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
tag.set_attribute(attribute,"data:#{content_type};base64,#{content_benc}")
|
284
|
-
|
285
|
-
end
|
320
|
+
def set_contents
|
321
|
+
# prepare queue
|
322
|
+
@contents.each{|k,v| @queue.push k}
|
323
|
+
#start download threads
|
324
|
+
self.start_download_thread
|
325
|
+
# wait until download finished.
|
326
|
+
@threads.each{|t|t.join}
|
327
|
+
@contents.each do |k,v|
|
328
|
+
tag=v[:parser_ref]
|
329
|
+
if tag.name == "script"
|
330
|
+
content_benc=Base64.encode64(v[:body]).gsub(/\n/,'')
|
331
|
+
attribute=v[:attribute_name]
|
332
|
+
content_type=v[:content_type]
|
333
|
+
tag.content=v[:body]
|
334
|
+
tag.remove_attribute(v[:attribute_name])
|
335
|
+
elsif tag.name == "link" and v[:content_type]="text/css"
|
336
|
+
tag.after("<style type=\"text/css\">#{v[:body]}</style>")
|
337
|
+
tag.remove()
|
338
|
+
else
|
339
|
+
# back to inline for non-script and style files...
|
340
|
+
content_benc=Base64.encode64(v[:body]).gsub(/\n/,'')
|
341
|
+
attribute=v[:attribute_name]
|
342
|
+
content_type=v[:content_type]
|
343
|
+
tag.set_attribute(attribute,"data:#{content_type};base64,#{content_benc}")
|
286
344
|
end
|
287
345
|
end
|
288
|
-
|
289
346
|
end
|
290
|
-
|
347
|
+
end
|
291
348
|
end
|
@@ -27,4 +27,26 @@ class TestWebPageArchiver < Test::Unit::TestCase
|
|
27
27
|
assert(mhtml.match('<img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAIAAAACACAYAAADDPmHLAAAACXBIWXMAABYlAAAWJQFJUiTwAAAAGXRFWHRTb2Z0d2'))
|
28
28
|
end
|
29
29
|
|
30
|
+
|
31
|
+
class JoinUriTestClass
|
32
|
+
include WebPageArchiver::GeneratorHelpers
|
33
|
+
end
|
34
|
+
|
35
|
+
def test_join_uri
|
36
|
+
a = JoinUriTestClass.new
|
37
|
+
assert_equal("http://murb.github.com/web-page-archiver/static/asdf", a.join_uri("http://murb.github.com/web-page-archiver/static/","asdf"))
|
38
|
+
assert_equal("http://murb.github.com/web-page-archiver/asdf", a.join_uri("http://murb.github.com/web-page-archiver/static","asdf"))
|
39
|
+
assert_equal("http://google.com", a.join_uri("http://murb.github.com/web-page-archiver/static","http://google.com"))
|
40
|
+
|
41
|
+
# this test will fail on Windows ...
|
42
|
+
Dir.mkdir 'C:' unless File.exists?('C:')
|
43
|
+
Dir.mkdir 'C:/testingdir/' unless File.exists?('C:/testingdir')
|
44
|
+
assert_equal("C:/test", a.join_uri(File.open("C:/testing", 'w+'),"test"))
|
45
|
+
assert_equal("C:/testingdir/test", a.join_uri(File.open("C:/testingdir/a", 'w+'),"test"))
|
46
|
+
File.delete 'C:/testingdir/a'
|
47
|
+
Dir.rmdir 'C:/testingdir/'
|
48
|
+
File.delete 'C:/testing'
|
49
|
+
Dir.rmdir 'C:'
|
50
|
+
|
51
|
+
end
|
30
52
|
end
|
data/web-page-archiver.gemspec
CHANGED
@@ -4,24 +4,24 @@
|
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
|
-
s.name =
|
8
|
-
s.version = "0.0.4"
|
7
|
+
s.name = "web-page-archiver"
|
8
|
+
s.version = "0.0.6"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["takuya", "murb"]
|
12
|
-
s.date =
|
13
|
-
s.description =
|
14
|
-
s.email =
|
12
|
+
s.date = "2013-04-09"
|
13
|
+
s.description = "web page archiver creates self-containing, one file, html or mhtml filese"
|
14
|
+
s.email = "github.com+web-page-archiver@murb.nl"
|
15
15
|
s.extra_rdoc_files = [
|
16
16
|
"LICENSE.txt",
|
17
|
-
"README.
|
17
|
+
"README.md"
|
18
18
|
]
|
19
19
|
s.files = [
|
20
20
|
".document",
|
21
21
|
"Gemfile",
|
22
22
|
"Gemfile.lock",
|
23
23
|
"LICENSE.txt",
|
24
|
-
"README.
|
24
|
+
"README.md",
|
25
25
|
"Rakefile",
|
26
26
|
"VERSION",
|
27
27
|
"fixtures/image.png",
|
@@ -35,38 +35,34 @@ Gem::Specification.new do |s|
|
|
35
35
|
"test/test_web_page_archiver.rb",
|
36
36
|
"web-page-archiver.gemspec"
|
37
37
|
]
|
38
|
-
s.homepage =
|
38
|
+
s.homepage = "http://github.com/murb/web-page-archiver"
|
39
39
|
s.licenses = ["MIT"]
|
40
40
|
s.require_paths = ["lib"]
|
41
|
-
s.rubygems_version =
|
42
|
-
s.summary =
|
41
|
+
s.rubygems_version = "1.8.23"
|
42
|
+
s.summary = "web page archiver creates self-containing, one file, html or mhtml files"
|
43
43
|
|
44
44
|
if s.respond_to? :specification_version then
|
45
|
-
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
46
45
|
s.specification_version = 3
|
47
46
|
|
48
|
-
if Gem::Version.new(Gem::
|
47
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
49
48
|
s.add_runtime_dependency(%q<mime-types>, [">= 0"])
|
49
|
+
s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
|
50
50
|
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
51
|
-
s.add_development_dependency(%q<bundler>, ["~> 1
|
52
|
-
s.add_development_dependency(%q<jeweler>, ["~> 1
|
53
|
-
s.add_development_dependency(%q<rcov>, [">= 0"])
|
54
|
-
s.add_development_dependency(%q<nokogiri>, [">= 0"])
|
51
|
+
s.add_development_dependency(%q<bundler>, ["~> 1"])
|
52
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1"])
|
55
53
|
else
|
56
54
|
s.add_dependency(%q<mime-types>, [">= 0"])
|
57
|
-
s.add_dependency(%q<shoulda>, [">= 0"])
|
58
|
-
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
59
|
-
s.add_dependency(%q<jeweler>, ["~> 1.6.0"])
|
60
|
-
s.add_dependency(%q<rcov>, [">= 0"])
|
61
55
|
s.add_dependency(%q<nokogiri>, [">= 0"])
|
56
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
57
|
+
s.add_dependency(%q<bundler>, ["~> 1"])
|
58
|
+
s.add_dependency(%q<jeweler>, ["~> 1"])
|
62
59
|
end
|
63
60
|
else
|
64
61
|
s.add_dependency(%q<mime-types>, [">= 0"])
|
65
|
-
s.add_dependency(%q<shoulda>, [">= 0"])
|
66
|
-
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
67
|
-
s.add_dependency(%q<jeweler>, ["~> 1.6.0"])
|
68
|
-
s.add_dependency(%q<rcov>, [">= 0"])
|
69
62
|
s.add_dependency(%q<nokogiri>, [">= 0"])
|
63
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
64
|
+
s.add_dependency(%q<bundler>, ["~> 1"])
|
65
|
+
s.add_dependency(%q<jeweler>, ["~> 1"])
|
70
66
|
end
|
71
67
|
end
|
72
68
|
|
metadata
CHANGED
@@ -1,114 +1,110 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: web-page-archiver
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
|
6
|
-
- 0
|
7
|
-
- 0
|
8
|
-
- 4
|
9
|
-
version: 0.0.4
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.6
|
5
|
+
prerelease:
|
10
6
|
platform: ruby
|
11
|
-
authors:
|
7
|
+
authors:
|
12
8
|
- takuya
|
13
9
|
- murb
|
14
10
|
autorequire:
|
15
11
|
bindir: bin
|
16
12
|
cert_chain: []
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
dependencies:
|
21
|
-
- !ruby/object:Gem::Dependency
|
13
|
+
date: 2013-04-09 00:00:00.000000000 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
22
16
|
name: mime-types
|
17
|
+
requirement: !ruby/object:Gem::Requirement
|
18
|
+
none: false
|
19
|
+
requirements:
|
20
|
+
- - ! '>='
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: '0'
|
23
|
+
type: :runtime
|
23
24
|
prerelease: false
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
none: false
|
27
|
+
requirements:
|
28
|
+
- - ! '>='
|
29
|
+
- !ruby/object:Gem::Version
|
30
|
+
version: '0'
|
31
|
+
- !ruby/object:Gem::Dependency
|
32
|
+
name: nokogiri
|
33
|
+
requirement: !ruby/object:Gem::Requirement
|
34
|
+
none: false
|
35
|
+
requirements:
|
36
|
+
- - ! '>='
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
31
39
|
type: :runtime
|
32
|
-
version_requirements: *id001
|
33
|
-
- !ruby/object:Gem::Dependency
|
34
|
-
name: shoulda
|
35
40
|
prerelease: false
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
41
|
+
version_requirements: !ruby/object:Gem::Requirement
|
42
|
+
none: false
|
43
|
+
requirements:
|
44
|
+
- - ! '>='
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '0'
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: shoulda
|
49
|
+
requirement: !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
43
55
|
type: :development
|
44
|
-
version_requirements: *id002
|
45
|
-
- !ruby/object:Gem::Dependency
|
46
|
-
name: bundler
|
47
56
|
prerelease: false
|
48
|
-
|
49
|
-
|
57
|
+
version_requirements: !ruby/object:Gem::Requirement
|
58
|
+
none: false
|
59
|
+
requirements:
|
60
|
+
- - ! '>='
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: '0'
|
63
|
+
- !ruby/object:Gem::Dependency
|
64
|
+
name: bundler
|
65
|
+
requirement: !ruby/object:Gem::Requirement
|
66
|
+
none: false
|
67
|
+
requirements:
|
50
68
|
- - ~>
|
51
|
-
- !ruby/object:Gem::Version
|
52
|
-
|
53
|
-
- 1
|
54
|
-
- 0
|
55
|
-
- 0
|
56
|
-
version: 1.0.0
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: '1'
|
57
71
|
type: :development
|
58
|
-
version_requirements: *id003
|
59
|
-
- !ruby/object:Gem::Dependency
|
60
|
-
name: jeweler
|
61
72
|
prerelease: false
|
62
|
-
|
63
|
-
|
73
|
+
version_requirements: !ruby/object:Gem::Requirement
|
74
|
+
none: false
|
75
|
+
requirements:
|
64
76
|
- - ~>
|
65
|
-
- !ruby/object:Gem::Version
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
77
|
+
- !ruby/object:Gem::Version
|
78
|
+
version: '1'
|
79
|
+
- !ruby/object:Gem::Dependency
|
80
|
+
name: jeweler
|
81
|
+
requirement: !ruby/object:Gem::Requirement
|
82
|
+
none: false
|
83
|
+
requirements:
|
84
|
+
- - ~>
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: '1'
|
71
87
|
type: :development
|
72
|
-
version_requirements: *id004
|
73
|
-
- !ruby/object:Gem::Dependency
|
74
|
-
name: rcov
|
75
88
|
prerelease: false
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
version: "0"
|
83
|
-
type: :development
|
84
|
-
version_requirements: *id005
|
85
|
-
- !ruby/object:Gem::Dependency
|
86
|
-
name: nokogiri
|
87
|
-
prerelease: false
|
88
|
-
requirement: &id006 !ruby/object:Gem::Requirement
|
89
|
-
requirements:
|
90
|
-
- - ">="
|
91
|
-
- !ruby/object:Gem::Version
|
92
|
-
segments:
|
93
|
-
- 0
|
94
|
-
version: "0"
|
95
|
-
type: :development
|
96
|
-
version_requirements: *id006
|
89
|
+
version_requirements: !ruby/object:Gem::Requirement
|
90
|
+
none: false
|
91
|
+
requirements:
|
92
|
+
- - ~>
|
93
|
+
- !ruby/object:Gem::Version
|
94
|
+
version: '1'
|
97
95
|
description: web page archiver creates self-containing, one file, html or mhtml filese
|
98
96
|
email: github.com+web-page-archiver@murb.nl
|
99
97
|
executables: []
|
100
|
-
|
101
98
|
extensions: []
|
102
|
-
|
103
|
-
extra_rdoc_files:
|
99
|
+
extra_rdoc_files:
|
104
100
|
- LICENSE.txt
|
105
|
-
- README.
|
106
|
-
files:
|
101
|
+
- README.md
|
102
|
+
files:
|
107
103
|
- .document
|
108
104
|
- Gemfile
|
109
105
|
- Gemfile.lock
|
110
106
|
- LICENSE.txt
|
111
|
-
- README.
|
107
|
+
- README.md
|
112
108
|
- Rakefile
|
113
109
|
- VERSION
|
114
110
|
- fixtures/image.png
|
@@ -121,35 +117,30 @@ files:
|
|
121
117
|
- test/helper.rb
|
122
118
|
- test/test_web_page_archiver.rb
|
123
119
|
- web-page-archiver.gemspec
|
124
|
-
has_rdoc: true
|
125
120
|
homepage: http://github.com/murb/web-page-archiver
|
126
|
-
licenses:
|
121
|
+
licenses:
|
127
122
|
- MIT
|
128
123
|
post_install_message:
|
129
124
|
rdoc_options: []
|
130
|
-
|
131
|
-
require_paths:
|
125
|
+
require_paths:
|
132
126
|
- lib
|
133
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
requirements:
|
142
|
-
- -
|
143
|
-
- !ruby/object:Gem::Version
|
144
|
-
|
145
|
-
- 0
|
146
|
-
version: "0"
|
127
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
128
|
+
none: false
|
129
|
+
requirements:
|
130
|
+
- - ! '>='
|
131
|
+
- !ruby/object:Gem::Version
|
132
|
+
version: '0'
|
133
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
134
|
+
none: false
|
135
|
+
requirements:
|
136
|
+
- - ! '>='
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
147
139
|
requirements: []
|
148
|
-
|
149
140
|
rubyforge_project:
|
150
|
-
rubygems_version: 1.
|
141
|
+
rubygems_version: 1.8.23
|
151
142
|
signing_key:
|
152
143
|
specification_version: 3
|
153
144
|
summary: web page archiver creates self-containing, one file, html or mhtml files
|
154
145
|
test_files: []
|
155
|
-
|
146
|
+
has_rdoc:
|