embed_html 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -1
- data/bin/eurl +2 -1
- data/embed_html.gemspec +1 -1
- data/lib/embed_html/embeder.rb +10 -6
- metadata +2 -2
data/Rakefile
CHANGED
@@ -3,7 +3,7 @@ require 'rubygems'
|
|
3
3
|
require 'rake'
|
4
4
|
require 'echoe'
|
5
5
|
|
6
|
-
Echoe.new('embed_html', '0.2.2') do |p|
|
6
|
+
Echoe.new('embed_html', '0.2.3') do |p|
|
7
7
|
p.description = "Download and embed images in html using base64 data encoding"
|
8
8
|
p.summary = "Download or process a HTML page, find images there, download them and embed it into the HTML using Base64 data encoding"
|
9
9
|
p.url = "http://github.com/siuying/embed_html"
|
data/bin/eurl
CHANGED
@@ -2,12 +2,13 @@ require 'embed_html'
|
|
2
2
|
|
3
3
|
url = ARGV[0]
|
4
4
|
file = ARGV[1]
|
5
|
+
concurrency = ARGV[2].nil? ? 5 : ARGV[2].to_i
|
5
6
|
|
6
7
|
if url && file
|
7
8
|
log = Logger.new($stdout)
|
8
9
|
log.level = Logger::INFO
|
9
10
|
|
10
|
-
html = EmbedHtml::Embeder.new(url, log).process
|
11
|
+
html = EmbedHtml::Embeder.new(url, log, concurrency).process
|
11
12
|
File.open(file, 'w') {|f| f.write(html)}
|
12
13
|
|
13
14
|
else
|
data/embed_html.gemspec
CHANGED
data/lib/embed_html/embeder.rb
CHANGED
@@ -11,10 +11,12 @@ module EmbedHtml
|
|
11
11
|
|
12
12
|
attr_accessor :url
|
13
13
|
attr_accessor :logger
|
14
|
+
attr_accessor :concurrency
|
14
15
|
|
15
|
-
def initialize(url, logger=Logger.new($stdout))
|
16
|
+
def initialize(url, logger=Logger.new($stdout), concurrency=MAX_CONCURRENCY)
|
16
17
|
@logger = logger
|
17
18
|
@url = url
|
19
|
+
@concurrency = concurrency
|
18
20
|
end
|
19
21
|
|
20
22
|
def process
|
@@ -22,20 +24,22 @@ module EmbedHtml
|
|
22
24
|
html = Typhoeus::Request.get(@url.to_s).body
|
23
25
|
doc = Hpricot(html)
|
24
26
|
|
25
|
-
hydra = Typhoeus::Hydra.new(:max_concurrency => MAX_CONCURRENCY)
|
27
|
+
hydra = Typhoeus::Hydra.new(:max_concurrency => @concurrency)
|
26
28
|
doc.search("//img").each do |img|
|
27
29
|
begin
|
28
30
|
hydra.queue create_fetch_file_request(img, 'src')
|
29
31
|
rescue StandardError => e
|
30
|
-
@logger.error "failed download image: #{img['src']}"
|
32
|
+
@logger.error "failed download image: #{img['src']} #{e.inspect}"
|
31
33
|
end
|
32
34
|
end
|
33
35
|
|
34
36
|
doc.search("//script").each do |script|
|
35
37
|
begin
|
36
|
-
|
38
|
+
if script['src']
|
39
|
+
hydra.queue create_fetch_file_request(script, 'src')
|
40
|
+
end
|
37
41
|
rescue StandardError => e
|
38
|
-
@logger.error "failed download script: #{script['src']}"
|
42
|
+
@logger.error "failed download script: #{script['src']} #{e.inspect}"
|
39
43
|
end
|
40
44
|
end
|
41
45
|
|
@@ -43,7 +47,7 @@ module EmbedHtml
|
|
43
47
|
begin
|
44
48
|
hydra.queue create_fetch_file_request(link, 'href')
|
45
49
|
rescue StandardError => e
|
46
|
-
@logger.error "failed download linked resource: #{link['href']}"
|
50
|
+
@logger.error "failed download linked resource: #{link['href']} #{e.inspect}"
|
47
51
|
end
|
48
52
|
end
|
49
53
|
|