embed_html 0.3.0 → 0.3.2
This diff shows the changes between two publicly available versions of the package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
- data/README.markdown +1 -0
- data/Rakefile +2 -2
- data/embed_html.gemspec +13 -12
- data/lib/embed_html/embeder.rb +27 -14
- metadata +55 -59
data/README.markdown
CHANGED
data/Rakefile
CHANGED
@@ -3,13 +3,13 @@ require 'rubygems'
|
|
3
3
|
require 'rake'
|
4
4
|
require 'echoe'
|
5
5
|
|
6
|
-
Echoe.new('embed_html', '0.3.0') do |p|
|
6
|
+
Echoe.new('embed_html', '0.3.2') do |p|
|
7
7
|
p.description = "Download and embed images in html using base64 data encoding"
|
8
8
|
p.summary = "Download or process a HTML page, find images there, download them and embed it into the HTML using Base64 data encoding"
|
9
9
|
p.url = "http://github.com/siuying/embed_html"
|
10
10
|
p.author = "Francis Chong"
|
11
11
|
p.email = "francis@ignition.hk"
|
12
12
|
p.ignore_pattern = ["tmp/*", "script/*", "*.html"]
|
13
|
-
p.runtime_dependencies = ["hpricot", "mime-types"]
|
13
|
+
p.runtime_dependencies = ["hpricot", "typhoeus", "mime-types"]
|
14
14
|
end
|
15
15
|
|
data/embed_html.gemspec
CHANGED
@@ -1,38 +1,39 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
|
-
s.name =
|
5
|
-
s.version = "0.3.0"
|
4
|
+
s.name = "embed_html"
|
5
|
+
s.version = "0.3.2"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Francis Chong"]
|
9
|
-
s.date =
|
10
|
-
s.
|
11
|
-
s.
|
12
|
-
s.email = %q{francis@ignition.hk}
|
9
|
+
s.date = "2012-02-01"
|
10
|
+
s.description = "Download and embed images in html using base64 data encoding"
|
11
|
+
s.email = "francis@ignition.hk"
|
13
12
|
s.executables = ["eurl"]
|
14
13
|
s.extra_rdoc_files = ["README.markdown", "bin/eurl", "lib/embed_html.rb", "lib/embed_html/embeder.rb"]
|
15
14
|
s.files = ["Manifest", "README.markdown", "Rakefile", "bin/eurl", "lib/embed_html.rb", "lib/embed_html/embeder.rb", "embed_html.gemspec"]
|
16
|
-
s.homepage =
|
15
|
+
s.homepage = "http://github.com/siuying/embed_html"
|
17
16
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Embed_html", "--main", "README.markdown"]
|
18
17
|
s.require_paths = ["lib"]
|
19
|
-
s.rubyforge_project =
|
20
|
-
s.rubygems_version =
|
21
|
-
s.summary =
|
18
|
+
s.rubyforge_project = "embed_html"
|
19
|
+
s.rubygems_version = "1.8.10"
|
20
|
+
s.summary = "Download or process a HTML page, find images there, download them and embed it into the HTML using Base64 data encoding"
|
22
21
|
|
23
22
|
if s.respond_to? :specification_version then
|
24
|
-
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
25
23
|
s.specification_version = 3
|
26
24
|
|
27
|
-
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
25
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
28
26
|
s.add_runtime_dependency(%q<hpricot>, [">= 0"])
|
27
|
+
s.add_runtime_dependency(%q<typhoeus>, [">= 0"])
|
29
28
|
s.add_runtime_dependency(%q<mime-types>, [">= 0"])
|
30
29
|
else
|
31
30
|
s.add_dependency(%q<hpricot>, [">= 0"])
|
31
|
+
s.add_dependency(%q<typhoeus>, [">= 0"])
|
32
32
|
s.add_dependency(%q<mime-types>, [">= 0"])
|
33
33
|
end
|
34
34
|
else
|
35
35
|
s.add_dependency(%q<hpricot>, [">= 0"])
|
36
|
+
s.add_dependency(%q<typhoeus>, [">= 0"])
|
36
37
|
s.add_dependency(%q<mime-types>, [">= 0"])
|
37
38
|
end
|
38
39
|
end
|
data/lib/embed_html/embeder.rb
CHANGED
@@ -10,25 +10,30 @@ module EmbedHtml
|
|
10
10
|
class Embeder
|
11
11
|
MAX_CONCURRENCY = 5
|
12
12
|
|
13
|
-
attr_accessor :
|
13
|
+
attr_accessor :url_or_html
|
14
14
|
attr_accessor :logger
|
15
15
|
attr_accessor :concurrency
|
16
|
+
attr_accessor :base_dirname
|
16
17
|
|
17
|
-
def initialize(
|
18
|
+
def initialize(url_or_html, logger=Logger.new($stdout), concurrency=MAX_CONCURRENCY)
|
18
19
|
@logger = logger
|
19
|
-
@
|
20
|
+
@url_or_html = url_or_html
|
20
21
|
@concurrency = concurrency
|
21
22
|
end
|
22
23
|
|
23
24
|
def process
|
24
|
-
@logger.info "downloading url: #{@
|
25
|
-
html = Typhoeus::Request.get(@
|
25
|
+
# @logger.info "downloading url: #{@url_or_html}"
|
26
|
+
html = (@url_or_html =~ /$http/) ? Typhoeus::Request.get(@url_or_html.to_s).body : @url_or_html
|
26
27
|
doc = Hpricot(html)
|
27
28
|
|
28
29
|
hydra = Typhoeus::Hydra.new(:max_concurrency => @concurrency)
|
29
30
|
doc.search("//img").each do |img|
|
30
31
|
begin
|
31
|
-
|
32
|
+
if img['src']=~ /^http/
|
33
|
+
hydra.queue create_fetch_file_request(img, 'src')
|
34
|
+
else
|
35
|
+
fetch_file(img, 'src')
|
36
|
+
end
|
32
37
|
rescue StandardError => e
|
33
38
|
@logger.error "failed download image: #{img['src']} #{e.inspect}"
|
34
39
|
end
|
@@ -36,8 +41,10 @@ module EmbedHtml
|
|
36
41
|
|
37
42
|
doc.search("//script").each do |script|
|
38
43
|
begin
|
39
|
-
if script['src']
|
44
|
+
if script['src'] and script['src'] =~ /^http/
|
40
45
|
hydra.queue create_fetch_file_request(script, 'src')
|
46
|
+
elsif script['src']
|
47
|
+
fetch_file(script, 'src')
|
41
48
|
end
|
42
49
|
rescue StandardError => e
|
43
50
|
@logger.error "failed download script: #{script['src']} #{e.inspect}"
|
@@ -46,7 +53,12 @@ module EmbedHtml
|
|
46
53
|
|
47
54
|
doc.search("//link").each do |link|
|
48
55
|
begin
|
49
|
-
|
56
|
+
url = link['href']
|
57
|
+
if url =~ /^http/
|
58
|
+
hydra.queue create_fetch_file_request(link, 'href')
|
59
|
+
else
|
60
|
+
fetch_file(link, 'href')
|
61
|
+
end
|
50
62
|
rescue StandardError => e
|
51
63
|
@logger.error "failed download linked resource: #{link['href']} #{e.inspect}"
|
52
64
|
end
|
@@ -54,13 +66,13 @@ module EmbedHtml
|
|
54
66
|
|
55
67
|
hydra.run
|
56
68
|
|
57
|
-
@logger.info "done"
|
69
|
+
# @logger.info "done"
|
58
70
|
doc.to_html
|
59
71
|
end
|
60
72
|
|
61
73
|
def process_local
|
62
|
-
@logger.info "downloading url: #{@
|
63
|
-
html = open(@
|
74
|
+
# @logger.info "downloading url: #{@url_or_html}"
|
75
|
+
html = open(@url_or_html).read
|
64
76
|
doc = Hpricot(html)
|
65
77
|
|
66
78
|
doc.search("//img").each do |img|
|
@@ -87,13 +99,13 @@ module EmbedHtml
|
|
87
99
|
end
|
88
100
|
end
|
89
101
|
|
90
|
-
@logger.info "done"
|
102
|
+
# @logger.info "done"
|
91
103
|
doc.to_html
|
92
104
|
end
|
93
105
|
|
94
106
|
private
|
95
107
|
def create_fetch_file_request(element, field)
|
96
|
-
file_url = URI.join(@
|
108
|
+
file_url = (@url_or_html =~ /^http/) ? URI.join(@url_or_html, element.attributes[field]) : element.attributes[field]
|
97
109
|
@logger.debug "queue download file: #{file_url}"
|
98
110
|
|
99
111
|
request = Typhoeus::Request.new(file_url.to_s)
|
@@ -109,8 +121,9 @@ module EmbedHtml
|
|
109
121
|
end
|
110
122
|
|
111
123
|
def fetch_file(element, field)
|
112
|
-
file_url = element.attributes[field]
|
124
|
+
file_url = @base_dirname ? "#{@base_dirname.to_s}/#{element.attributes[field]}" : element.attributes[field]
|
113
125
|
@logger.debug "queue download file: #{file_url}"
|
126
|
+
return unless File.exists?(file_url)
|
114
127
|
|
115
128
|
type = MIME::Types.type_for(file_url).first.to_s rescue "application/data"
|
116
129
|
data = open(file_url.to_s).read
|
metadata
CHANGED
@@ -1,58 +1,60 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: embed_html
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
|
6
|
-
- 0
|
7
|
-
- 3
|
8
|
-
- 0
|
9
|
-
version: 0.3.0
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.3.2
|
5
|
+
prerelease:
|
10
6
|
platform: ruby
|
11
|
-
authors:
|
7
|
+
authors:
|
12
8
|
- Francis Chong
|
13
9
|
autorequire:
|
14
10
|
bindir: bin
|
15
11
|
cert_chain: []
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
dependencies:
|
20
|
-
- !ruby/object:Gem::Dependency
|
12
|
+
date: 2012-02-01 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
21
15
|
name: hpricot
|
16
|
+
requirement: &70320708644640 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
22
23
|
prerelease: false
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
24
|
+
version_requirements: *70320708644640
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: typhoeus
|
27
|
+
requirement: &70320708641840 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
30
33
|
type: :runtime
|
31
|
-
version_requirements: *id001
|
32
|
-
- !ruby/object:Gem::Dependency
|
33
|
-
name: mime-types
|
34
34
|
prerelease: false
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
35
|
+
version_requirements: *70320708641840
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: mime-types
|
38
|
+
requirement: &70320708641160 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
42
44
|
type: :runtime
|
43
|
-
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *70320708641160
|
44
47
|
description: Download and embed images in html using base64 data encoding
|
45
48
|
email: francis@ignition.hk
|
46
|
-
executables:
|
49
|
+
executables:
|
47
50
|
- eurl
|
48
51
|
extensions: []
|
49
|
-
|
50
|
-
extra_rdoc_files:
|
52
|
+
extra_rdoc_files:
|
51
53
|
- README.markdown
|
52
54
|
- bin/eurl
|
53
55
|
- lib/embed_html.rb
|
54
56
|
- lib/embed_html/embeder.rb
|
55
|
-
files:
|
57
|
+
files:
|
56
58
|
- Manifest
|
57
59
|
- README.markdown
|
58
60
|
- Rakefile
|
@@ -60,41 +62,35 @@ files:
|
|
60
62
|
- lib/embed_html.rb
|
61
63
|
- lib/embed_html/embeder.rb
|
62
64
|
- embed_html.gemspec
|
63
|
-
has_rdoc: true
|
64
65
|
homepage: http://github.com/siuying/embed_html
|
65
66
|
licenses: []
|
66
|
-
|
67
67
|
post_install_message:
|
68
|
-
rdoc_options:
|
68
|
+
rdoc_options:
|
69
69
|
- --line-numbers
|
70
70
|
- --inline-source
|
71
71
|
- --title
|
72
72
|
- Embed_html
|
73
73
|
- --main
|
74
74
|
- README.markdown
|
75
|
-
require_paths:
|
75
|
+
require_paths:
|
76
76
|
- lib
|
77
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
requirements:
|
86
|
-
- -
|
87
|
-
- !ruby/object:Gem::Version
|
88
|
-
|
89
|
-
- 1
|
90
|
-
- 2
|
91
|
-
version: "1.2"
|
77
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
78
|
+
none: false
|
79
|
+
requirements:
|
80
|
+
- - ! '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
84
|
+
none: false
|
85
|
+
requirements:
|
86
|
+
- - ! '>='
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '1.2'
|
92
89
|
requirements: []
|
93
|
-
|
94
90
|
rubyforge_project: embed_html
|
95
|
-
rubygems_version: 1.
|
91
|
+
rubygems_version: 1.8.10
|
96
92
|
signing_key:
|
97
93
|
specification_version: 3
|
98
|
-
summary: Download or process a HTML page, find images there, download them and embed
|
94
|
+
summary: Download or process a HTML page, find images there, download them and embed
|
95
|
+
it into the HTML using Base64 data encoding
|
99
96
|
test_files: []
|
100
|
-
|