embed_html 0.3.0 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.markdown +1 -0
- data/Rakefile +2 -2
- data/embed_html.gemspec +13 -12
- data/lib/embed_html/embeder.rb +27 -14
- metadata +55 -59
data/README.markdown
CHANGED
data/Rakefile
CHANGED
@@ -3,13 +3,13 @@ require 'rubygems'
|
|
3
3
|
require 'rake'
|
4
4
|
require 'echoe'
|
5
5
|
|
6
|
-
Echoe.new('embed_html', '0.3.
|
6
|
+
Echoe.new('embed_html', '0.3.2') do |p|
|
7
7
|
p.description = "Download and embed images in html using base64 data encoding"
|
8
8
|
p.summary = "Download or process a HTML page, find images there, download them and embed it into the HTML using Base64 data encoding"
|
9
9
|
p.url = "http://github.com/siuying/embed_html"
|
10
10
|
p.author = "Francis Chong"
|
11
11
|
p.email = "francis@ignition.hk"
|
12
12
|
p.ignore_pattern = ["tmp/*", "script/*", "*.html"]
|
13
|
-
p.runtime_dependencies = ["hpricot", "mime-types"]
|
13
|
+
p.runtime_dependencies = ["hpricot", "typhoeus", "mime-types"]
|
14
14
|
end
|
15
15
|
|
data/embed_html.gemspec
CHANGED
@@ -1,38 +1,39 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
|
-
s.name =
|
5
|
-
s.version = "0.3.
|
4
|
+
s.name = "embed_html"
|
5
|
+
s.version = "0.3.2"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Francis Chong"]
|
9
|
-
s.date =
|
10
|
-
s.
|
11
|
-
s.
|
12
|
-
s.email = %q{francis@ignition.hk}
|
9
|
+
s.date = "2012-02-01"
|
10
|
+
s.description = "Download and embed images in html using base64 data encoding"
|
11
|
+
s.email = "francis@ignition.hk"
|
13
12
|
s.executables = ["eurl"]
|
14
13
|
s.extra_rdoc_files = ["README.markdown", "bin/eurl", "lib/embed_html.rb", "lib/embed_html/embeder.rb"]
|
15
14
|
s.files = ["Manifest", "README.markdown", "Rakefile", "bin/eurl", "lib/embed_html.rb", "lib/embed_html/embeder.rb", "embed_html.gemspec"]
|
16
|
-
s.homepage =
|
15
|
+
s.homepage = "http://github.com/siuying/embed_html"
|
17
16
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Embed_html", "--main", "README.markdown"]
|
18
17
|
s.require_paths = ["lib"]
|
19
|
-
s.rubyforge_project =
|
20
|
-
s.rubygems_version =
|
21
|
-
s.summary =
|
18
|
+
s.rubyforge_project = "embed_html"
|
19
|
+
s.rubygems_version = "1.8.10"
|
20
|
+
s.summary = "Download or process a HTML page, find images there, download them and embed it into the HTML using Base64 data encoding"
|
22
21
|
|
23
22
|
if s.respond_to? :specification_version then
|
24
|
-
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
25
23
|
s.specification_version = 3
|
26
24
|
|
27
|
-
if Gem::Version.new(Gem::
|
25
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
28
26
|
s.add_runtime_dependency(%q<hpricot>, [">= 0"])
|
27
|
+
s.add_runtime_dependency(%q<typhoeus>, [">= 0"])
|
29
28
|
s.add_runtime_dependency(%q<mime-types>, [">= 0"])
|
30
29
|
else
|
31
30
|
s.add_dependency(%q<hpricot>, [">= 0"])
|
31
|
+
s.add_dependency(%q<typhoeus>, [">= 0"])
|
32
32
|
s.add_dependency(%q<mime-types>, [">= 0"])
|
33
33
|
end
|
34
34
|
else
|
35
35
|
s.add_dependency(%q<hpricot>, [">= 0"])
|
36
|
+
s.add_dependency(%q<typhoeus>, [">= 0"])
|
36
37
|
s.add_dependency(%q<mime-types>, [">= 0"])
|
37
38
|
end
|
38
39
|
end
|
data/lib/embed_html/embeder.rb
CHANGED
@@ -10,25 +10,30 @@ module EmbedHtml
|
|
10
10
|
class Embeder
|
11
11
|
MAX_CONCURRENCY = 5
|
12
12
|
|
13
|
-
attr_accessor :
|
13
|
+
attr_accessor :url_or_html
|
14
14
|
attr_accessor :logger
|
15
15
|
attr_accessor :concurrency
|
16
|
+
attr_accessor :base_dirname
|
16
17
|
|
17
|
-
def initialize(
|
18
|
+
def initialize(url_or_html, logger=Logger.new($stdout), concurrency=MAX_CONCURRENCY)
|
18
19
|
@logger = logger
|
19
|
-
@
|
20
|
+
@url_or_html = url_or_html
|
20
21
|
@concurrency = concurrency
|
21
22
|
end
|
22
23
|
|
23
24
|
def process
|
24
|
-
@logger.info "downloading url: #{@
|
25
|
-
html = Typhoeus::Request.get(@
|
25
|
+
# @logger.info "downloading url: #{@url_or_html}"
|
26
|
+
html = (@url_or_html =~ /$http/) ? Typhoeus::Request.get(@url_or_html.to_s).body : @url_or_html
|
26
27
|
doc = Hpricot(html)
|
27
28
|
|
28
29
|
hydra = Typhoeus::Hydra.new(:max_concurrency => @concurrency)
|
29
30
|
doc.search("//img").each do |img|
|
30
31
|
begin
|
31
|
-
|
32
|
+
if img['src']=~ /^http/
|
33
|
+
hydra.queue create_fetch_file_request(img, 'src')
|
34
|
+
else
|
35
|
+
fetch_file(img, 'src')
|
36
|
+
end
|
32
37
|
rescue StandardError => e
|
33
38
|
@logger.error "failed download image: #{img['src']} #{e.inspect}"
|
34
39
|
end
|
@@ -36,8 +41,10 @@ module EmbedHtml
|
|
36
41
|
|
37
42
|
doc.search("//script").each do |script|
|
38
43
|
begin
|
39
|
-
if script['src']
|
44
|
+
if script['src'] and script['src'] =~ /^http/
|
40
45
|
hydra.queue create_fetch_file_request(script, 'src')
|
46
|
+
elsif script['src']
|
47
|
+
fetch_file(script, 'src')
|
41
48
|
end
|
42
49
|
rescue StandardError => e
|
43
50
|
@logger.error "failed download script: #{script['src']} #{e.inspect}"
|
@@ -46,7 +53,12 @@ module EmbedHtml
|
|
46
53
|
|
47
54
|
doc.search("//link").each do |link|
|
48
55
|
begin
|
49
|
-
|
56
|
+
url = link['href']
|
57
|
+
if url =~ /^http/
|
58
|
+
hydra.queue create_fetch_file_request(link, 'href')
|
59
|
+
else
|
60
|
+
fetch_file(link, 'href')
|
61
|
+
end
|
50
62
|
rescue StandardError => e
|
51
63
|
@logger.error "failed download linked resource: #{link['href']} #{e.inspect}"
|
52
64
|
end
|
@@ -54,13 +66,13 @@ module EmbedHtml
|
|
54
66
|
|
55
67
|
hydra.run
|
56
68
|
|
57
|
-
@logger.info "done"
|
69
|
+
# @logger.info "done"
|
58
70
|
doc.to_html
|
59
71
|
end
|
60
72
|
|
61
73
|
def process_local
|
62
|
-
@logger.info "downloading url: #{@
|
63
|
-
html = open(@
|
74
|
+
# @logger.info "downloading url: #{@url_or_html}"
|
75
|
+
html = open(@url_or_html).read
|
64
76
|
doc = Hpricot(html)
|
65
77
|
|
66
78
|
doc.search("//img").each do |img|
|
@@ -87,13 +99,13 @@ module EmbedHtml
|
|
87
99
|
end
|
88
100
|
end
|
89
101
|
|
90
|
-
@logger.info "done"
|
102
|
+
# @logger.info "done"
|
91
103
|
doc.to_html
|
92
104
|
end
|
93
105
|
|
94
106
|
private
|
95
107
|
def create_fetch_file_request(element, field)
|
96
|
-
file_url = URI.join(@
|
108
|
+
file_url = (@url_or_html =~ /^http/) ? URI.join(@url_or_html, element.attributes[field]) : element.attributes[field]
|
97
109
|
@logger.debug "queue download file: #{file_url}"
|
98
110
|
|
99
111
|
request = Typhoeus::Request.new(file_url.to_s)
|
@@ -109,8 +121,9 @@ module EmbedHtml
|
|
109
121
|
end
|
110
122
|
|
111
123
|
def fetch_file(element, field)
|
112
|
-
file_url = element.attributes[field]
|
124
|
+
file_url = @base_dirname ? "#{@base_dirname.to_s}/#{element.attributes[field]}" : element.attributes[field]
|
113
125
|
@logger.debug "queue download file: #{file_url}"
|
126
|
+
return unless File.exists?(file_url)
|
114
127
|
|
115
128
|
type = MIME::Types.type_for(file_url).first.to_s rescue "application/data"
|
116
129
|
data = open(file_url.to_s).read
|
metadata
CHANGED
@@ -1,58 +1,60 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: embed_html
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
|
6
|
-
- 0
|
7
|
-
- 3
|
8
|
-
- 0
|
9
|
-
version: 0.3.0
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.3.2
|
5
|
+
prerelease:
|
10
6
|
platform: ruby
|
11
|
-
authors:
|
7
|
+
authors:
|
12
8
|
- Francis Chong
|
13
9
|
autorequire:
|
14
10
|
bindir: bin
|
15
11
|
cert_chain: []
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
dependencies:
|
20
|
-
- !ruby/object:Gem::Dependency
|
12
|
+
date: 2012-02-01 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
21
15
|
name: hpricot
|
16
|
+
requirement: &70320708644640 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
22
23
|
prerelease: false
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
24
|
+
version_requirements: *70320708644640
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: typhoeus
|
27
|
+
requirement: &70320708641840 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
30
33
|
type: :runtime
|
31
|
-
version_requirements: *id001
|
32
|
-
- !ruby/object:Gem::Dependency
|
33
|
-
name: mime-types
|
34
34
|
prerelease: false
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
35
|
+
version_requirements: *70320708641840
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: mime-types
|
38
|
+
requirement: &70320708641160 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
42
44
|
type: :runtime
|
43
|
-
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *70320708641160
|
44
47
|
description: Download and embed images in html using base64 data encoding
|
45
48
|
email: francis@ignition.hk
|
46
|
-
executables:
|
49
|
+
executables:
|
47
50
|
- eurl
|
48
51
|
extensions: []
|
49
|
-
|
50
|
-
extra_rdoc_files:
|
52
|
+
extra_rdoc_files:
|
51
53
|
- README.markdown
|
52
54
|
- bin/eurl
|
53
55
|
- lib/embed_html.rb
|
54
56
|
- lib/embed_html/embeder.rb
|
55
|
-
files:
|
57
|
+
files:
|
56
58
|
- Manifest
|
57
59
|
- README.markdown
|
58
60
|
- Rakefile
|
@@ -60,41 +62,35 @@ files:
|
|
60
62
|
- lib/embed_html.rb
|
61
63
|
- lib/embed_html/embeder.rb
|
62
64
|
- embed_html.gemspec
|
63
|
-
has_rdoc: true
|
64
65
|
homepage: http://github.com/siuying/embed_html
|
65
66
|
licenses: []
|
66
|
-
|
67
67
|
post_install_message:
|
68
|
-
rdoc_options:
|
68
|
+
rdoc_options:
|
69
69
|
- --line-numbers
|
70
70
|
- --inline-source
|
71
71
|
- --title
|
72
72
|
- Embed_html
|
73
73
|
- --main
|
74
74
|
- README.markdown
|
75
|
-
require_paths:
|
75
|
+
require_paths:
|
76
76
|
- lib
|
77
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
requirements:
|
86
|
-
- -
|
87
|
-
- !ruby/object:Gem::Version
|
88
|
-
|
89
|
-
- 1
|
90
|
-
- 2
|
91
|
-
version: "1.2"
|
77
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
78
|
+
none: false
|
79
|
+
requirements:
|
80
|
+
- - ! '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
84
|
+
none: false
|
85
|
+
requirements:
|
86
|
+
- - ! '>='
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '1.2'
|
92
89
|
requirements: []
|
93
|
-
|
94
90
|
rubyforge_project: embed_html
|
95
|
-
rubygems_version: 1.
|
91
|
+
rubygems_version: 1.8.10
|
96
92
|
signing_key:
|
97
93
|
specification_version: 3
|
98
|
-
summary: Download or process a HTML page, find images there, download them and embed
|
94
|
+
summary: Download or process a HTML page, find images there, download them and embed
|
95
|
+
it into the HTML using Base64 data encoding
|
99
96
|
test_files: []
|
100
|
-
|