embed_html 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Manifest +6 -0
- data/README.markdown +28 -0
- data/Rakefile +15 -0
- data/bin/eurl +16 -0
- data/embed_html.gemspec +38 -0
- data/lib/embed_html/embeder.rb +50 -0
- data/lib/embed_html.rb +4 -0
- metadata +100 -0
data/Manifest
ADDED
data/README.markdown
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
Dependencies
|
2
|
+
============
|
3
|
+
|
4
|
+
* Hpricot
|
5
|
+
* Typhoeus
|
6
|
+
|
7
|
+
Install
|
8
|
+
=======
|
9
|
+
|
10
|
+
> gem install embed_html
|
11
|
+
|
12
|
+
Usage
|
13
|
+
=======
|
14
|
+
|
15
|
+
> eurl _<input-url>_ _<output-file>_
|
16
|
+
|
17
|
+
Example:
|
18
|
+
|
19
|
+
> eurl "http://en.wikipedia.org/wiki/Wiki" wiki.html
|
20
|
+
|
21
|
+
Output:
|
22
|
+
|
23
|
+
> I, [2010-05-20T15:56:58.315769 #26393] INFO -- : downloading url: http://en.wikipedia.org/wiki/Wiki
|
24
|
+
|
25
|
+
> I, [2010-05-20T15:57:02.033068 #26393] INFO -- : done
|
26
|
+
|
27
|
+
The file wiki.html then contains the specified page, with all of the images embedded.
|
28
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Rakefile
|
2
|
+
require 'rubygems'
|
3
|
+
require 'rake'
|
4
|
+
require 'echoe'
|
5
|
+
|
6
|
+
# Packaging metadata for the embed_html gem, declared through Echoe.
Echoe.new('embed_html', '0.1.0') do |gem|
  # Who maintains the gem and where it lives.
  gem.author = "Francis Chong"
  gem.email  = "francis@ignition.hk"
  gem.url    = "http://github.com/siuying/embed_html"

  # Short and long descriptions.
  gem.description = "Download and embed images in html using base64 data encoding"
  gem.summary     = "Download or process a HTML page, find images there, download them and embed it into the HTML using Base64 data encoding"

  # Gems needed at run time, and path patterns handed to Echoe's ignore list.
  gem.runtime_dependencies = %w[hpricot typhoeus]
  gem.ignore_pattern       = %w[tmp/* script/* *.html]
end
|
15
|
+
|
data/bin/eurl
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
# eurl: download a web page and write a copy of it whose images are embedded
# in the HTML as base64 data URIs.
#
# usage: eurl <URL> <OUTPUT_FILE>

# Logger is used directly below, so require it here instead of relying on
# embed_html to have loaded it.
require 'logger'
require 'embed_html'

url, file = ARGV

# Both arguments are mandatory. On misuse, print the usage line to stderr and
# exit with a non-zero status so calling scripts can detect the failure.
abort "usage: eurl <URL> <OUTPUT_FILE>" unless url && file

log = Logger.new($stdout)
log.level = Logger::INFO

html = EmbedHtml::Embeder.new(url, log).process
File.open(file, 'w') { |f| f.write(html) }
|
data/embed_html.gemspec
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
# Gem specification for embed_html 0.1.0.
#
# Optional or legacy attributes are only assigned when the running RubyGems
# still provides the setter, so the file loads on both old and new releases.
Gem::Specification.new do |s|
  s.name = %q{embed_html}
  s.version = "0.1.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Francis Chong"]
  s.date = %q{2010-05-20}
  # Legacy setter; it may be missing from newer RubyGems, hence the guard.
  s.default_executable = %q{eurl} if s.respond_to? :default_executable=
  s.description = %q{Download and embed images in html using base64 data encoding}
  s.email = %q{francis@ignition.hk}
  s.executables = ["eurl"]
  s.extra_rdoc_files = ["README.markdown", "bin/eurl", "lib/embed_html.rb", "lib/embed_html/embeder.rb"]
  s.files = ["Manifest", "README.markdown", "Rakefile", "bin/eurl", "lib/embed_html.rb", "lib/embed_html/embeder.rb", "embed_html.gemspec"]
  s.homepage = %q{http://github.com/siuying/embed_html}
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Embed_html", "--main", "README.markdown"]
  s.require_paths = ["lib"]
  # Legacy setter; it may be missing from newer RubyGems, hence the guard.
  s.rubyforge_project = %q{embed_html} if s.respond_to? :rubyforge_project=
  s.rubygems_version = %q{1.3.6}
  s.summary = %q{Download or process a HTML page, find images there, download them and embed it into the HTML using Base64 data encoding}

  s.specification_version = 3 if s.respond_to? :specification_version=

  # Choose the dependency API by capability rather than by comparing against
  # Gem::RubyGemsVersion, a constant that is not defined by current RubyGems.
  if s.respond_to? :add_runtime_dependency
    s.add_runtime_dependency(%q<hpricot>, [">= 0"])
    s.add_runtime_dependency(%q<typhoeus>, [">= 0"])
  else
    s.add_dependency(%q<hpricot>, [">= 0"])
    s.add_dependency(%q<typhoeus>, [">= 0"])
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'logger'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'hpricot'
|
4
|
+
require 'uri'
|
5
|
+
require 'base64'
|
6
|
+
require 'typhoeus'
|
7
|
+
|
8
|
+
module EmbedHtml
  # Downloads an HTML page and rewrites the src of each <img> element to a
  # base64 "data:" URI built from the downloaded image.
  class Embeder
    # Passed to Typhoeus::Hydra as its :max_concurrency option.
    MAX_CONCURRENCY = 5

    attr_accessor :url
    attr_accessor :logger

    # url    - address of the page to download; also the base against which
    #          each image src is resolved.
    # logger - object receiving info/debug/error messages; defaults to a
    #          Logger writing to $stdout.
    def initialize(url, logger=Logger.new($stdout))
      @logger = logger
      @url = url
    end

    # Downloads the page at #url, queues one request per <img> element and,
    # once the queue has run, returns the document serialized with to_html.
    #
    # Images are left untouched when they have no src, when their request
    # cannot be queued, or when the download does not succeed.
    def process
      @logger.info "downloading url: #{@url}"
      html = Typhoeus::Request.get(@url.to_s).body
      doc = Hpricot(html)

      hydra = Typhoeus::Hydra.new(:max_concurrency => MAX_CONCURRENCY)
      doc.search("//img").each do |img|
        src = img.attributes['src']
        # Nothing to resolve or download for an image without a src.
        if src.nil? || src.to_s.strip.empty?
          @logger.debug "skipping image without src"
          next
        end

        begin
          image_url = URI.join(@url, src)
          @logger.debug "queue download image: #{image_url}"

          request = Typhoeus::Request.new(image_url.to_s)
          request.on_complete do |response|
            embed_response(img, image_url, response)
          end
          hydra.queue request
        rescue StandardError => e
          # Log the raw src: image_url is still nil when URI.join itself failed.
          @logger.error "failed downloading image: #{src} (#{e.message})"
        end
      end
      hydra.run
      @logger.info "done"
      doc.to_html
    end

    private

    # Replaces img's src with a data URI built from response, provided the
    # download succeeded and reported a Content-Type.
    def embed_response(img, image_url, response)
      # Never embed an error page (e.g. an HTML 404 body) in place of an image.
      unless (200..299).include?(response.code)
        @logger.error "failed downloading image: #{image_url} (HTTP #{response.code})"
        return
      end

      data = response.body
      type = response.headers_hash["Content-Type"]
      return unless data && type

      # encode64 wraps its output with a newline every 60 characters; strip
      # them so the data URI is a single unbroken token.
      data_b64 = Base64.encode64(data).delete("\n")
      img.attributes['src'] = "data:#{type};base64,#{data_b64}"
    end
  end
end
|
data/lib/embed_html.rb
ADDED
metadata
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: embed_html
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 1
|
8
|
+
- 0
|
9
|
+
version: 0.1.0
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Francis Chong
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-05-20 00:00:00 +08:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: hpricot
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 0
|
29
|
+
version: "0"
|
30
|
+
type: :runtime
|
31
|
+
version_requirements: *id001
|
32
|
+
- !ruby/object:Gem::Dependency
|
33
|
+
name: typhoeus
|
34
|
+
prerelease: false
|
35
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - ">="
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
segments:
|
40
|
+
- 0
|
41
|
+
version: "0"
|
42
|
+
type: :runtime
|
43
|
+
version_requirements: *id002
|
44
|
+
description: Download and embed images in html using base64 data encoding
|
45
|
+
email: francis@ignition.hk
|
46
|
+
executables:
|
47
|
+
- eurl
|
48
|
+
extensions: []
|
49
|
+
|
50
|
+
extra_rdoc_files:
|
51
|
+
- README.markdown
|
52
|
+
- bin/eurl
|
53
|
+
- lib/embed_html.rb
|
54
|
+
- lib/embed_html/embeder.rb
|
55
|
+
files:
|
56
|
+
- Manifest
|
57
|
+
- README.markdown
|
58
|
+
- Rakefile
|
59
|
+
- bin/eurl
|
60
|
+
- lib/embed_html.rb
|
61
|
+
- lib/embed_html/embeder.rb
|
62
|
+
- embed_html.gemspec
|
63
|
+
has_rdoc: true
|
64
|
+
homepage: http://github.com/siuying/embed_html
|
65
|
+
licenses: []
|
66
|
+
|
67
|
+
post_install_message:
|
68
|
+
rdoc_options:
|
69
|
+
- --line-numbers
|
70
|
+
- --inline-source
|
71
|
+
- --title
|
72
|
+
- Embed_html
|
73
|
+
- --main
|
74
|
+
- README.markdown
|
75
|
+
require_paths:
|
76
|
+
- lib
|
77
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
78
|
+
requirements:
|
79
|
+
- - ">="
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
segments:
|
82
|
+
- 0
|
83
|
+
version: "0"
|
84
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - ">="
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
segments:
|
89
|
+
- 1
|
90
|
+
- 2
|
91
|
+
version: "1.2"
|
92
|
+
requirements: []
|
93
|
+
|
94
|
+
rubyforge_project: embed_html
|
95
|
+
rubygems_version: 1.3.6
|
96
|
+
signing_key:
|
97
|
+
specification_version: 3
|
98
|
+
summary: Download or process a HTML page, find images there, download them and embed it into the HTML using Base64 data encoding
|
99
|
+
test_files: []
|
100
|
+
|