embed_html 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Manifest ADDED
@@ -0,0 +1,6 @@
1
+ Manifest
2
+ README.markdown
3
+ Rakefile
4
+ bin/eurl
5
+ lib/embed_html.rb
6
+ lib/embed_html/embeder.rb
data/README.markdown ADDED
@@ -0,0 +1,28 @@
1
+ Dependencies
2
+ ============
3
+
4
+ * Hpricot
5
+ * Typhoeus
6
+
7
+ Install
8
+ =======
9
+
10
+ > gem install embed_html
11
+
12
+ Usage
13
+ =======
14
+
15
+ > eurl _<input-url>_ _<output-file>_
16
+
17
+ Example:
18
+
19
+ > eurl "http://en.wikipedia.org/wiki/Wiki" wiki.html
20
+
21
+ Output:
22
+
23
+ > I, [2010-05-20T15:56:58.315769 #26393] INFO -- : downloading url: http://en.wikipedia.org/wiki/Wiki
24
+
25
+ > I, [2010-05-20T15:57:02.033068 #26393] INFO -- : done
26
+
27
+ The file wiki.html then contains the specified page, with all of the images embedded.
28
+
data/Rakefile ADDED
@@ -0,0 +1,15 @@
1
+ # Rakefile
2
+ require 'rubygems'
3
+ require 'rake'
4
+ require 'echoe'
5
+
6
+ Echoe.new('embed_html', '0.1.0') do |p| # Declares the gem name and version; the block fills in the packaging metadata.
7
+ p.description = "Download and embed images in html using base64 data encoding"
8
+ p.summary = "Download or process a HTML page, find images there, download them and embed it into the HTML using Base64 data encoding"
9
+ p.url = "http://github.com/siuying/embed_html"
10
+ p.author = "Francis Chong"
11
+ p.email = "francis@ignition.hk"
12
+ p.ignore_pattern = ["tmp/*", "script/*", "*.html"] # NOTE(review): presumably paths Echoe leaves out of the package -- confirm against the Echoe docs.
13
+ p.runtime_dependencies = ["hpricot", "typhoeus"] # The two third-party libraries required by lib/embed_html/embeder.rb.
14
+ end
15
+
data/bin/eurl ADDED
@@ -0,0 +1,16 @@
1
+ require 'embed_html' # Command-line entry point: eurl <URL> <OUTPUT_FILE>.
2
+
3
+ url = ARGV[0] # First argument: URL of the page to download.
4
+ file = ARGV[1] # Second argument: path of the file the resulting HTML is written to.
5
+
6
+ if url && file # Both arguments are required; otherwise the usage line below is printed.
7
+ log = Logger.new($stdout) # Logger is not required in this script; presumably loaded via embed_html -- confirm.
8
+ log.level = Logger::INFO # At INFO level the Embeder's debug messages are not printed.
9
+
10
+ html = EmbedHtml::Embeder.new(url, log).process
11
+ File.open(file, 'w') {|f| f.write(html)} # Mode 'w' truncates an existing output file.
12
+
13
+ else
14
+ puts "usage: eurl <URL> <OUTPUT_FILE>"
15
+
16
+ end
data/embed_html.gemspec ADDED
@@ -0,0 +1,38 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ Gem::Specification.new do |s| # Gem specification for embed_html 0.1.0; its values mirror the Echoe block in the Rakefile.
4
+ s.name = %q{embed_html}
5
+ s.version = "0.1.0"
6
+
7
+ s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version= # Set only when this RubyGems exposes the setter.
8
+ s.authors = ["Francis Chong"]
9
+ s.date = %q{2010-05-20}
10
+ s.default_executable = %q{eurl}
11
+ s.description = %q{Download and embed images in html using base64 data encoding}
12
+ s.email = %q{francis@ignition.hk}
13
+ s.executables = ["eurl"]
14
+ s.extra_rdoc_files = ["README.markdown", "bin/eurl", "lib/embed_html.rb", "lib/embed_html/embeder.rb"]
15
+ s.files = ["Manifest", "README.markdown", "Rakefile", "bin/eurl", "lib/embed_html.rb", "lib/embed_html/embeder.rb", "embed_html.gemspec"]
16
+ s.homepage = %q{http://github.com/siuying/embed_html}
17
+ s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Embed_html", "--main", "README.markdown"]
18
+ s.require_paths = ["lib"]
19
+ s.rubyforge_project = %q{embed_html}
20
+ s.rubygems_version = %q{1.3.6}
21
+ s.summary = %q{Download or process a HTML page, find images there, download them and embed it into the HTML using Base64 data encoding}
22
+
23
+ if s.respond_to? :specification_version then # All three branches below declare the same two dependencies; only the method used differs.
24
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION # Assigned but not read anywhere in this block.
25
+ s.specification_version = 3
26
+
27
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then # add_runtime_dependency is used only from RubyGems 1.2.0 on.
28
+ s.add_runtime_dependency(%q<hpricot>, [">= 0"])
29
+ s.add_runtime_dependency(%q<typhoeus>, [">= 0"])
30
+ else
31
+ s.add_dependency(%q<hpricot>, [">= 0"])
32
+ s.add_dependency(%q<typhoeus>, [">= 0"])
33
+ end
34
+ else
35
+ s.add_dependency(%q<hpricot>, [">= 0"])
36
+ s.add_dependency(%q<typhoeus>, [">= 0"])
37
+ end
38
+ end
data/lib/embed_html/embeder.rb ADDED
@@ -0,0 +1,50 @@
1
+ require 'logger'
2
+ require 'open-uri'
3
+ require 'hpricot'
4
+ require 'uri'
5
+ require 'base64'
6
+ require 'typhoeus'
7
+
8
+ module EmbedHtml
9
+ class Embeder
10
+ MAX_CONCURRENCY = 5
11
+
12
+ attr_accessor :url
13
+ attr_accessor :logger
14
+
15
+ def initialize(url, logger=Logger.new($stdout))
16
+ @logger = logger
17
+ @url = url
18
+ end
19
+
20
+ def process
21
+ @logger.info "downloading url: #{@url}"
22
+ html = Typhoeus::Request.get(@url.to_s).body
23
+ doc = Hpricot(html)
24
+
25
+ hydra = Typhoeus::Hydra.new(:max_concurrency => MAX_CONCURRENCY)
26
+ doc.search("//img").each do |img|
27
+ begin
28
+ image_url = URI.join(@url, img.attributes['src'])
29
+ @logger.debug "queue download image: #{image_url}"
30
+
31
+ request = Typhoeus::Request.new(image_url.to_s)
32
+ request.on_complete do |response|
33
+ data = response.body
34
+ type = response.headers_hash["Content-Type"]
35
+ if data && type
36
+ data_b64 = Base64.encode64(data)
37
+ img.attributes['src'] = "data:#{type};base64,#{data_b64}"
38
+ end
39
+ end
40
+ hydra.queue request
41
+ rescue StandardError => e
42
+ @logger.error "failed downloading image: #{image_url} (#{e.message})"
43
+ end
44
+ end
45
+ hydra.run
46
+ @logger.info "done"
47
+ doc.to_html
48
+ end
49
+ end
50
+ end
data/lib/embed_html.rb ADDED
@@ -0,0 +1,4 @@
1
+ path = File.dirname(__FILE__) # Directory that contains this file.
2
+ $:.unshift(path) unless $:.include?(path) # Put that directory at the front of the load path, unless it is already listed.
3
+
4
+ require 'embed_html/embeder' # Loads EmbedHtml::Embeder.
metadata ADDED
@@ -0,0 +1,100 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: embed_html
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 0
9
+ version: 0.1.0
10
+ platform: ruby
11
+ authors:
12
+ - Francis Chong
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-05-20 00:00:00 +08:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: hpricot
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 0
29
+ version: "0"
30
+ type: :runtime
31
+ version_requirements: *id001
32
+ - !ruby/object:Gem::Dependency
33
+ name: typhoeus
34
+ prerelease: false
35
+ requirement: &id002 !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ segments:
40
+ - 0
41
+ version: "0"
42
+ type: :runtime
43
+ version_requirements: *id002
44
+ description: Download and embed images in html using base64 data encoding
45
+ email: francis@ignition.hk
46
+ executables:
47
+ - eurl
48
+ extensions: []
49
+
50
+ extra_rdoc_files:
51
+ - README.markdown
52
+ - bin/eurl
53
+ - lib/embed_html.rb
54
+ - lib/embed_html/embeder.rb
55
+ files:
56
+ - Manifest
57
+ - README.markdown
58
+ - Rakefile
59
+ - bin/eurl
60
+ - lib/embed_html.rb
61
+ - lib/embed_html/embeder.rb
62
+ - embed_html.gemspec
63
+ has_rdoc: true
64
+ homepage: http://github.com/siuying/embed_html
65
+ licenses: []
66
+
67
+ post_install_message:
68
+ rdoc_options:
69
+ - --line-numbers
70
+ - --inline-source
71
+ - --title
72
+ - Embed_html
73
+ - --main
74
+ - README.markdown
75
+ require_paths:
76
+ - lib
77
+ required_ruby_version: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ segments:
82
+ - 0
83
+ version: "0"
84
+ required_rubygems_version: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ segments:
89
+ - 1
90
+ - 2
91
+ version: "1.2"
92
+ requirements: []
93
+
94
+ rubyforge_project: embed_html
95
+ rubygems_version: 1.3.6
96
+ signing_key:
97
+ specification_version: 3
98
+ summary: Download or process a HTML page, find images there, download them and embed it into the HTML using Base64 data encoding
99
+ test_files: []
100
+