embed_html 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Manifest +6 -0
- data/README.markdown +28 -0
- data/Rakefile +15 -0
- data/bin/eurl +16 -0
- data/embed_html.gemspec +38 -0
- data/lib/embed_html/embeder.rb +50 -0
- data/lib/embed_html.rb +4 -0
- metadata +100 -0
data/Manifest
ADDED
data/README.markdown
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
Dependencies
|
|
2
|
+
============
|
|
3
|
+
|
|
4
|
+
* Hpricot
|
|
5
|
+
* Typhoeus
|
|
6
|
+
|
|
7
|
+
Install
|
|
8
|
+
=======
|
|
9
|
+
|
|
10
|
+
> gem install embed_html
|
|
11
|
+
|
|
12
|
+
Usage
|
|
13
|
+
=======
|
|
14
|
+
|
|
15
|
+
> eurl _<input-url>_ _<output-file>_
|
|
16
|
+
|
|
17
|
+
Example:
|
|
18
|
+
|
|
19
|
+
> eurl "http://en.wikipedia.org/wiki/Wiki" wiki.html
|
|
20
|
+
|
|
21
|
+
Output:
|
|
22
|
+
|
|
23
|
+
> I, [2010-05-20T15:56:58.315769 #26393] INFO -- : downloading url: http://en.wikipedia.org/wiki/Wiki
|
|
24
|
+
|
|
25
|
+
> I, [2010-05-20T15:57:02.033068 #26393] INFO -- : done
|
|
26
|
+
|
|
27
|
+
The file wiki.html then contains the specified page, with all of the images embedded.
|
|
28
|
+
|
data/Rakefile
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# Rakefile
|
|
2
|
+
require 'rubygems'
|
|
3
|
+
require 'rake'
|
|
4
|
+
require 'echoe'
|
|
5
|
+
|
|
6
|
+
# Packaging configuration for the embed_html gem, version 0.1.0, declared
# through Echoe. The block parameter is named +gem+ rather than +p+ so it
# does not shadow Kernel#p.
Echoe.new('embed_html', '0.1.0') do |gem|
  # Who maintains the gem and where it lives.
  gem.author = "Francis Chong"
  gem.email = "francis@ignition.hk"
  gem.url = "http://github.com/siuying/embed_html"

  # Short and long descriptions.
  gem.summary = "Download or process a HTML page, find images there, download them and embed it into the HTML using Base64 data encoding"
  gem.description = "Download and embed images in html using base64 data encoding"

  # Path patterns handed to Echoe's ignore list
  # (presumably excluded from the packaged gem -- confirm against Echoe docs).
  gem.ignore_pattern = ["tmp/*", "script/*", "*.html"]

  # Gems required at runtime by lib/embed_html/embeder.rb.
  gem.runtime_dependencies = ["hpricot", "typhoeus"]
end
|
|
15
|
+
|
data/bin/eurl
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
require 'embed_html'
|
|
2
|
+
|
|
3
|
+
# Command-line driver: `eurl <URL> <OUTPUT_FILE>`.
# Fetches the page at URL through EmbedHtml::Embeder and writes the resulting
# HTML to OUTPUT_FILE. When either argument is missing, only the usage line
# is printed.
url, file = ARGV.values_at(0, 1)

if url.nil? || file.nil?
  puts "usage: eurl <URL> <OUTPUT_FILE>"
else
  # Progress messages go to standard output at INFO level.
  log = Logger.new($stdout)
  log.level = Logger::INFO

  page = EmbedHtml::Embeder.new(url, log).process
  File.open(file, 'w') { |out| out.write(page) }
end
|
data/embed_html.gemspec
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
# Gem specification for embed_html 0.1.0.
#
# Setters that do not exist in every RubyGems release are guarded with
# respond_to?, matching the existing guard on required_rubygems_version=,
# so the file loads whether or not the running RubyGems provides them.
Gem::Specification.new do |s|
  s.name = 'embed_html'
  s.version = '0.1.0'

  s.required_rubygems_version = Gem::Requirement.new('>= 1.2') if s.respond_to? :required_rubygems_version=
  s.authors = ['Francis Chong']
  s.date = '2010-05-20'
  # Legacy attribute; only assigned when the running RubyGems still has it.
  s.default_executable = 'eurl' if s.respond_to? :default_executable=
  s.description = 'Download and embed images in html using base64 data encoding'
  s.email = 'francis@ignition.hk'
  s.executables = ['eurl']
  s.extra_rdoc_files = ['README.markdown', 'bin/eurl', 'lib/embed_html.rb', 'lib/embed_html/embeder.rb']
  s.files = ['Manifest', 'README.markdown', 'Rakefile', 'bin/eurl', 'lib/embed_html.rb', 'lib/embed_html/embeder.rb', 'embed_html.gemspec']
  s.homepage = 'http://github.com/siuying/embed_html'
  s.rdoc_options = ['--line-numbers', '--inline-source', '--title', 'Embed_html', '--main', 'README.markdown']
  s.require_paths = ['lib']
  # Legacy attribute; only assigned when the running RubyGems still has it.
  s.rubyforge_project = 'embed_html' if s.respond_to? :rubyforge_project=
  s.rubygems_version = '1.3.6'
  s.summary = 'Download or process a HTML page, find images there, download them and embed it into the HTML using Base64 data encoding'

  s.specification_version = 3 if s.respond_to? :specification_version=

  # A single add_dependency call per gem replaces the previous three-way
  # branch, which consulted Gem::RubyGemsVersion and declared the same two
  # dependencies in every branch.
  s.add_dependency('hpricot', ['>= 0'])
  s.add_dependency('typhoeus', ['>= 0'])
end
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
require 'logger'
|
|
2
|
+
require 'open-uri'
|
|
3
|
+
require 'hpricot'
|
|
4
|
+
require 'uri'
|
|
5
|
+
require 'base64'
|
|
6
|
+
require 'typhoeus'
|
|
7
|
+
|
|
8
|
+
module EmbedHtml
  # Fetches an HTML page and rewrites the +src+ of its <img> elements to
  # base64 +data:+ URIs built from the downloaded image bodies.
  #
  # Images are downloaded through a Typhoeus::Hydra limited to
  # MAX_CONCURRENCY simultaneous requests.
  class Embeder
    # Upper bound passed to Typhoeus::Hydra as :max_concurrency.
    MAX_CONCURRENCY = 5

    attr_accessor :url
    attr_accessor :logger

    # url    - address of the page to process (converted with #to_s before use).
    # logger - object receiving debug/info/warn/error messages; defaults to a
    #          Logger writing to standard output.
    def initialize(url, logger=Logger.new($stdout))
      @logger = logger
      @url = url
    end

    # Downloads the page at +url+, queues one request per embeddable <img>,
    # runs the queue and returns the resulting document as an HTML string.
    # Images that cannot be queued or fetched keep their original +src+.
    def process
      @logger.info "downloading url: #{@url}"
      html = Typhoeus::Request.get(@url.to_s).body
      doc = Hpricot(html)

      hydra = Typhoeus::Hydra.new(:max_concurrency => MAX_CONCURRENCY)
      doc.search("//img").each { |img| queue_image(hydra, img) }
      hydra.run
      @logger.info "done"
      doc.to_html
    end

    private

    # Queues the download of a single <img> element on +hydra+.
    # Elements without a +src+, or whose +src+ is already a data: URI, are
    # skipped. Any StandardError raised while queuing is logged and swallowed
    # so one bad image does not abort the whole page.
    def queue_image(hydra, img)
      src = img.attributes['src'].to_s.strip
      if src.empty? || src =~ /\Adata:/i
        @logger.debug "skipping image without downloadable src"
        return
      end

      # @url.to_s keeps this consistent with the page request in #process.
      image_url = URI.join(@url.to_s, src)
      @logger.debug "queue download image: #{image_url}"

      request = Typhoeus::Request.new(image_url.to_s)
      request.on_complete { |response| embed(img, image_url, response) }
      hydra.queue request
    rescue StandardError => e
      # image_url is nil when URI.join itself failed; fall back to the raw src.
      @logger.error "failed downloading image: #{image_url || src} (#{e.message})"
    end

    # Replaces the +src+ of +img+ with a data: URI built from +response+.
    # Nothing is changed unless the status is 2xx, the body is non-empty and
    # a Content-Type header is present.
    def embed(img, image_url, response)
      data = response.body
      type = content_type(response)
      success = (200..299).include?(response.code.to_i)

      unless success && data && !data.empty? && type
        @logger.warn "skipping image: #{image_url} (status #{response.code})"
        return
      end

      # encode64 inserts line feeds into its output; they are removed so the
      # attribute value is a single unbroken token.
      encoded = Base64.encode64(data).delete("\n")
      img.attributes['src'] = "data:#{type};base64,#{encoded}"
    end

    # Returns the Content-Type header of +response+ (header name matched
    # case-insensitively), or nil when the header is absent.
    def content_type(response)
      headers = response.headers_hash || {}
      _, value = headers.find { |name, _| name.to_s.downcase == 'content-type' }
      value.is_a?(Array) ? value.first : value
    end
  end
end
|
data/lib/embed_html.rb
ADDED
metadata
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: embed_html
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
prerelease: false
|
|
5
|
+
segments:
|
|
6
|
+
- 0
|
|
7
|
+
- 1
|
|
8
|
+
- 0
|
|
9
|
+
version: 0.1.0
|
|
10
|
+
platform: ruby
|
|
11
|
+
authors:
|
|
12
|
+
- Francis Chong
|
|
13
|
+
autorequire:
|
|
14
|
+
bindir: bin
|
|
15
|
+
cert_chain: []
|
|
16
|
+
|
|
17
|
+
date: 2010-05-20 00:00:00 +08:00
|
|
18
|
+
default_executable:
|
|
19
|
+
dependencies:
|
|
20
|
+
- !ruby/object:Gem::Dependency
|
|
21
|
+
name: hpricot
|
|
22
|
+
prerelease: false
|
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
|
24
|
+
requirements:
|
|
25
|
+
- - ">="
|
|
26
|
+
- !ruby/object:Gem::Version
|
|
27
|
+
segments:
|
|
28
|
+
- 0
|
|
29
|
+
version: "0"
|
|
30
|
+
type: :runtime
|
|
31
|
+
version_requirements: *id001
|
|
32
|
+
- !ruby/object:Gem::Dependency
|
|
33
|
+
name: typhoeus
|
|
34
|
+
prerelease: false
|
|
35
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - ">="
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
segments:
|
|
40
|
+
- 0
|
|
41
|
+
version: "0"
|
|
42
|
+
type: :runtime
|
|
43
|
+
version_requirements: *id002
|
|
44
|
+
description: Download and embed images in html using base64 data encoding
|
|
45
|
+
email: francis@ignition.hk
|
|
46
|
+
executables:
|
|
47
|
+
- eurl
|
|
48
|
+
extensions: []
|
|
49
|
+
|
|
50
|
+
extra_rdoc_files:
|
|
51
|
+
- README.markdown
|
|
52
|
+
- bin/eurl
|
|
53
|
+
- lib/embed_html.rb
|
|
54
|
+
- lib/embed_html/embeder.rb
|
|
55
|
+
files:
|
|
56
|
+
- Manifest
|
|
57
|
+
- README.markdown
|
|
58
|
+
- Rakefile
|
|
59
|
+
- bin/eurl
|
|
60
|
+
- lib/embed_html.rb
|
|
61
|
+
- lib/embed_html/embeder.rb
|
|
62
|
+
- embed_html.gemspec
|
|
63
|
+
has_rdoc: true
|
|
64
|
+
homepage: http://github.com/siuying/embed_html
|
|
65
|
+
licenses: []
|
|
66
|
+
|
|
67
|
+
post_install_message:
|
|
68
|
+
rdoc_options:
|
|
69
|
+
- --line-numbers
|
|
70
|
+
- --inline-source
|
|
71
|
+
- --title
|
|
72
|
+
- Embed_html
|
|
73
|
+
- --main
|
|
74
|
+
- README.markdown
|
|
75
|
+
require_paths:
|
|
76
|
+
- lib
|
|
77
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
78
|
+
requirements:
|
|
79
|
+
- - ">="
|
|
80
|
+
- !ruby/object:Gem::Version
|
|
81
|
+
segments:
|
|
82
|
+
- 0
|
|
83
|
+
version: "0"
|
|
84
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
85
|
+
requirements:
|
|
86
|
+
- - ">="
|
|
87
|
+
- !ruby/object:Gem::Version
|
|
88
|
+
segments:
|
|
89
|
+
- 1
|
|
90
|
+
- 2
|
|
91
|
+
version: "1.2"
|
|
92
|
+
requirements: []
|
|
93
|
+
|
|
94
|
+
rubyforge_project: embed_html
|
|
95
|
+
rubygems_version: 1.3.6
|
|
96
|
+
signing_key:
|
|
97
|
+
specification_version: 3
|
|
98
|
+
summary: Download or process a HTML page, find images there, download them and embed it into the HTML using Base64 data encoding
|
|
99
|
+
test_files: []
|
|
100
|
+
|