embed_html 0.3.0 → 0.3.2

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
data/README.markdown CHANGED
@@ -2,6 +2,7 @@ Dependencies
2
2
  ============
3
3
 
4
4
  * Hpricot
5
+ * Typhoeus
5
6
 
6
7
  Install
7
8
  =======
data/Rakefile CHANGED
@@ -3,13 +3,13 @@ require 'rubygems'
3
3
  require 'rake'
4
4
  require 'echoe'
5
5
 
6
- Echoe.new('embed_html', '0.3.0') do |p|
6
+ Echoe.new('embed_html', '0.3.2') do |p|
7
7
  p.description = "Download and embed images in html using base64 data encoding"
8
8
  p.summary = "Download or process a HTML page, find images there, download them and embed it into the HTML using Base64 data encoding"
9
9
  p.url = "http://github.com/siuying/embed_html"
10
10
  p.author = "Francis Chong"
11
11
  p.email = "francis@ignition.hk"
12
12
  p.ignore_pattern = ["tmp/*", "script/*", "*.html"]
13
- p.runtime_dependencies = ["hpricot", "mime-types"]
13
+ p.runtime_dependencies = ["hpricot", "typhoeus", "mime-types"]
14
14
  end
15
15
 
data/embed_html.gemspec CHANGED
@@ -1,38 +1,39 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  Gem::Specification.new do |s|
4
- s.name = %q{embed_html}
5
- s.version = "0.3.0"
4
+ s.name = "embed_html"
5
+ s.version = "0.3.2"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Francis Chong"]
9
- s.date = %q{2010-06-25}
10
- s.default_executable = %q{eurl}
11
- s.description = %q{Download and embed images in html using base64 data encoding}
12
- s.email = %q{francis@ignition.hk}
9
+ s.date = "2012-02-01"
10
+ s.description = "Download and embed images in html using base64 data encoding"
11
+ s.email = "francis@ignition.hk"
13
12
  s.executables = ["eurl"]
14
13
  s.extra_rdoc_files = ["README.markdown", "bin/eurl", "lib/embed_html.rb", "lib/embed_html/embeder.rb"]
15
14
  s.files = ["Manifest", "README.markdown", "Rakefile", "bin/eurl", "lib/embed_html.rb", "lib/embed_html/embeder.rb", "embed_html.gemspec"]
16
- s.homepage = %q{http://github.com/siuying/embed_html}
15
+ s.homepage = "http://github.com/siuying/embed_html"
17
16
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Embed_html", "--main", "README.markdown"]
18
17
  s.require_paths = ["lib"]
19
- s.rubyforge_project = %q{embed_html}
20
- s.rubygems_version = %q{1.3.6}
21
- s.summary = %q{Download or process a HTML page, find images there, download them and embed it into the HTML using Base64 data encoding}
18
+ s.rubyforge_project = "embed_html"
19
+ s.rubygems_version = "1.8.10"
20
+ s.summary = "Download or process a HTML page, find images there, download them and embed it into the HTML using Base64 data encoding"
22
21
 
23
22
  if s.respond_to? :specification_version then
24
- current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
25
23
  s.specification_version = 3
26
24
 
27
- if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
25
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
28
26
  s.add_runtime_dependency(%q<hpricot>, [">= 0"])
27
+ s.add_runtime_dependency(%q<typhoeus>, [">= 0"])
29
28
  s.add_runtime_dependency(%q<mime-types>, [">= 0"])
30
29
  else
31
30
  s.add_dependency(%q<hpricot>, [">= 0"])
31
+ s.add_dependency(%q<typhoeus>, [">= 0"])
32
32
  s.add_dependency(%q<mime-types>, [">= 0"])
33
33
  end
34
34
  else
35
35
  s.add_dependency(%q<hpricot>, [">= 0"])
36
+ s.add_dependency(%q<typhoeus>, [">= 0"])
36
37
  s.add_dependency(%q<mime-types>, [">= 0"])
37
38
  end
38
39
  end
data/lib/embed_html/embeder.rb CHANGED
@@ -10,25 +10,30 @@ module EmbedHtml
10
10
  class Embeder
11
11
  MAX_CONCURRENCY = 5
12
12
 
13
- attr_accessor :url
13
+ attr_accessor :url_or_html
14
14
  attr_accessor :logger
15
15
  attr_accessor :concurrency
16
+ attr_accessor :base_dirname
16
17
 
17
- def initialize(url, logger=Logger.new($stdout), concurrency=MAX_CONCURRENCY)
18
+ def initialize(url_or_html, logger=Logger.new($stdout), concurrency=MAX_CONCURRENCY)
18
19
  @logger = logger
19
- @url = url
20
+ @url_or_html = url_or_html
20
21
  @concurrency = concurrency
21
22
  end
22
23
 
23
24
  def process
24
- @logger.info "downloading url: #{@url}"
25
- html = Typhoeus::Request.get(@url.to_s).body
25
+ # @logger.info "downloading url: #{@url_or_html}"
26
+ html = (@url_or_html =~ /$http/) ? Typhoeus::Request.get(@url_or_html.to_s).body : @url_or_html
26
27
  doc = Hpricot(html)
27
28
 
28
29
  hydra = Typhoeus::Hydra.new(:max_concurrency => @concurrency)
29
30
  doc.search("//img").each do |img|
30
31
  begin
31
- hydra.queue create_fetch_file_request(img, 'src')
32
+ if img['src']=~ /^http/
33
+ hydra.queue create_fetch_file_request(img, 'src')
34
+ else
35
+ fetch_file(img, 'src')
36
+ end
32
37
  rescue StandardError => e
33
38
  @logger.error "failed download image: #{img['src']} #{e.inspect}"
34
39
  end
@@ -36,8 +41,10 @@ module EmbedHtml
36
41
 
37
42
  doc.search("//script").each do |script|
38
43
  begin
39
- if script['src']
44
+ if script['src'] and script['src'] =~ /^http/
40
45
  hydra.queue create_fetch_file_request(script, 'src')
46
+ elsif script['src']
47
+ fetch_file(script, 'src')
41
48
  end
42
49
  rescue StandardError => e
43
50
  @logger.error "failed download script: #{script['src']} #{e.inspect}"
@@ -46,7 +53,12 @@ module EmbedHtml
46
53
 
47
54
  doc.search("//link").each do |link|
48
55
  begin
49
- hydra.queue create_fetch_file_request(link, 'href')
56
+ url = link['href']
57
+ if url =~ /^http/
58
+ hydra.queue create_fetch_file_request(link, 'href')
59
+ else
60
+ fetch_file(link, 'href')
61
+ end
50
62
  rescue StandardError => e
51
63
  @logger.error "failed download linked resource: #{link['href']} #{e.inspect}"
52
64
  end
@@ -54,13 +66,13 @@ module EmbedHtml
54
66
 
55
67
  hydra.run
56
68
 
57
- @logger.info "done"
69
+ # @logger.info "done"
58
70
  doc.to_html
59
71
  end
60
72
 
61
73
  def process_local
62
- @logger.info "downloading url: #{@url}"
63
- html = open(@url).read
74
+ # @logger.info "downloading url: #{@url_or_html}"
75
+ html = open(@url_or_html).read
64
76
  doc = Hpricot(html)
65
77
 
66
78
  doc.search("//img").each do |img|
@@ -87,13 +99,13 @@ module EmbedHtml
87
99
  end
88
100
  end
89
101
 
90
- @logger.info "done"
102
+ # @logger.info "done"
91
103
  doc.to_html
92
104
  end
93
105
 
94
106
  private
95
107
  def create_fetch_file_request(element, field)
96
- file_url = URI.join(@url, element.attributes[field])
108
+ file_url = (@url_or_html =~ /^http/) ? URI.join(@url_or_html, element.attributes[field]) : element.attributes[field]
97
109
  @logger.debug "queue download file: #{file_url}"
98
110
 
99
111
  request = Typhoeus::Request.new(file_url.to_s)
@@ -109,8 +121,9 @@ module EmbedHtml
109
121
  end
110
122
 
111
123
  def fetch_file(element, field)
112
- file_url = element.attributes[field]
124
+ file_url = @base_dirname ? "#{@base_dirname.to_s}/#{element.attributes[field]}" : element.attributes[field]
113
125
  @logger.debug "queue download file: #{file_url}"
126
+ return unless File.exists?(file_url)
114
127
 
115
128
  type = MIME::Types.type_for(file_url).first.to_s rescue "application/data"
116
129
  data = open(file_url.to_s).read
metadata CHANGED
@@ -1,58 +1,60 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: embed_html
3
- version: !ruby/object:Gem::Version
4
- prerelease: false
5
- segments:
6
- - 0
7
- - 3
8
- - 0
9
- version: 0.3.0
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.3.2
5
+ prerelease:
10
6
  platform: ruby
11
- authors:
7
+ authors:
12
8
  - Francis Chong
13
9
  autorequire:
14
10
  bindir: bin
15
11
  cert_chain: []
16
-
17
- date: 2010-06-25 00:00:00 +08:00
18
- default_executable:
19
- dependencies:
20
- - !ruby/object:Gem::Dependency
12
+ date: 2012-02-01 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
21
15
  name: hpricot
16
+ requirement: &70320708644640 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
22
23
  prerelease: false
23
- requirement: &id001 !ruby/object:Gem::Requirement
24
- requirements:
25
- - - ">="
26
- - !ruby/object:Gem::Version
27
- segments:
28
- - 0
29
- version: "0"
24
+ version_requirements: *70320708644640
25
+ - !ruby/object:Gem::Dependency
26
+ name: typhoeus
27
+ requirement: &70320708641840 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
30
33
  type: :runtime
31
- version_requirements: *id001
32
- - !ruby/object:Gem::Dependency
33
- name: mime-types
34
34
  prerelease: false
35
- requirement: &id002 !ruby/object:Gem::Requirement
36
- requirements:
37
- - - ">="
38
- - !ruby/object:Gem::Version
39
- segments:
40
- - 0
41
- version: "0"
35
+ version_requirements: *70320708641840
36
+ - !ruby/object:Gem::Dependency
37
+ name: mime-types
38
+ requirement: &70320708641160 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
42
44
  type: :runtime
43
- version_requirements: *id002
45
+ prerelease: false
46
+ version_requirements: *70320708641160
44
47
  description: Download and embed images in html using base64 data encoding
45
48
  email: francis@ignition.hk
46
- executables:
49
+ executables:
47
50
  - eurl
48
51
  extensions: []
49
-
50
- extra_rdoc_files:
52
+ extra_rdoc_files:
51
53
  - README.markdown
52
54
  - bin/eurl
53
55
  - lib/embed_html.rb
54
56
  - lib/embed_html/embeder.rb
55
- files:
57
+ files:
56
58
  - Manifest
57
59
  - README.markdown
58
60
  - Rakefile
@@ -60,41 +62,35 @@ files:
60
62
  - lib/embed_html.rb
61
63
  - lib/embed_html/embeder.rb
62
64
  - embed_html.gemspec
63
- has_rdoc: true
64
65
  homepage: http://github.com/siuying/embed_html
65
66
  licenses: []
66
-
67
67
  post_install_message:
68
- rdoc_options:
68
+ rdoc_options:
69
69
  - --line-numbers
70
70
  - --inline-source
71
71
  - --title
72
72
  - Embed_html
73
73
  - --main
74
74
  - README.markdown
75
- require_paths:
75
+ require_paths:
76
76
  - lib
77
- required_ruby_version: !ruby/object:Gem::Requirement
78
- requirements:
79
- - - ">="
80
- - !ruby/object:Gem::Version
81
- segments:
82
- - 0
83
- version: "0"
84
- required_rubygems_version: !ruby/object:Gem::Requirement
85
- requirements:
86
- - - ">="
87
- - !ruby/object:Gem::Version
88
- segments:
89
- - 1
90
- - 2
91
- version: "1.2"
77
+ required_ruby_version: !ruby/object:Gem::Requirement
78
+ none: false
79
+ requirements:
80
+ - - ! '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ required_rubygems_version: !ruby/object:Gem::Requirement
84
+ none: false
85
+ requirements:
86
+ - - ! '>='
87
+ - !ruby/object:Gem::Version
88
+ version: '1.2'
92
89
  requirements: []
93
-
94
90
  rubyforge_project: embed_html
95
- rubygems_version: 1.3.6
91
+ rubygems_version: 1.8.10
96
92
  signing_key:
97
93
  specification_version: 3
98
- summary: Download or process a HTML page, find images there, download them and embed it into the HTML using Base64 data encoding
94
+ summary: Download or process a HTML page, find images there, download them and embed
95
+ it into the HTML using Base64 data encoding
99
96
  test_files: []
100
-