idl 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. data/bin/idl +19 -0
  2. data/lib/idl.rb +79 -0
  3. metadata +114 -0
data/bin/idl ADDED
@@ -0,0 +1,19 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'docopt'
4
+ require 'idl'
5
+
6
# Command-line interface definition, parsed by docopt.
#
# The optional target directory must be written as `<directory>`: in the
# docopt language a bare word is a literal command (yielding true/false),
# not a positional argument, so `[directory]` could never capture a path.
# Option descriptions are separated from the flags by at least two spaces,
# as docopt requires.
doc = <<DOC
IDl - Download all images from a webpage at given URL

Usage:
  idl [options] <URL> [<directory>]

Options:
  -h --help    Print this help message and exit
  -l --linked  Download linked-to images instead of included ones
DOC

begin
  options = Docopt::docopt(doc)
rescue Docopt::Exit => e
  # docopt signals --help and usage errors by raising; print the message
  # it prepared instead of letting the exception escape with a backtrace.
  puts e.message
  exit
end

if options['<URL>']
  # Fall back to the current directory when no target directory is given.
  IDl.new.harvest options['<URL>'], options['--linked'], options['<directory>'] || './'
end
data/lib/idl.rb ADDED
@@ -0,0 +1,79 @@
1
+ require 'open-uri'
2
+ require 'nokogiri'
3
+ require 'eventmachine'
4
+ require 'em-http-request'
5
+
6
# Reopens EventMachine::HttpClient to give every client object a `file`
# attribute, so an arbitrary object (such as an open output file) can be
# attached to an individual request.
module EventMachine
  class HttpClient
    # Reads the attached object; nil until something has been assigned.
    attr_reader :file
    # Attaches an object to this client.
    attr_writer :file
  end
end
11
+
12
# FileUtils is used by IDl#fetch to create the target directory.
require 'fileutils'

# Downloads the images found on a web page, in parallel, using
# EventMachine and em-http-request.
class IDl
  def initialize
    # Lower-case, dot-prefixed extensions that mark a link as an image.
    @image_extensions = %w(jpg jpeg png gif tif tiff).map { |ext| ".#{ext}" }
  end

  # Collects image URLs from the page at +url+ and downloads them.
  #
  # url        - address of the page to scan.
  # linked     - when truthy, download images that <a href> elements point
  #              to; otherwise download the sources of <img> elements.
  # target_dir - directory the files are written to.
  #
  # References that cannot be parsed or resolved as URIs are skipped
  # instead of aborting the whole run.
  def harvest(url, linked=false, target_dir='./')
    # URI#open (from open-uri) is used instead of Kernel#open: the latter
    # would treat input starting with "|" as a command to run and no longer
    # opens URLs on Ruby 3. The block form closes the handle after parsing.
    doc = URI.parse(url).open { |page| Nokogiri.parse page }

    refs = if linked
      doc.css('a[href]').map { |a| a['href'] }.select { |href| image_link?(href) }
    else
      doc.css('img[src]').map { |img| img['src'] }
    end

    urls = refs.map { |ref| absolute_url(url, ref) }.compact

    fetch urls, target_dir
  end

  # Downloads every distinct URL in +urls+ into +target_dir+ concurrently
  # and blocks until all requests have finished.
  #
  # Returns immediately (nil) when there is nothing to download: with no
  # requests in the pool the completion callback would never fire and the
  # reactor would run forever.
  def fetch(urls, target_dir='./')
    urls = Array(urls).uniq
    return if urls.empty?

    # Create the directory once up front rather than on every chunk.
    FileUtils.mkdir_p target_dir

    EM.run do
      request_pool = EM::MultiRequest.new

      request_pool.callback do
        puts 'All requests finished.'
        EM.stop
      end

      urls.each do |url|
        puts "Enqueuing [#{url}]"
        request = EM::HttpRequest.new(url).get

        request.stream do |chunk|
          # The output file is opened lazily, on the first chunk received.
          request.file ||= File.open(unique_filepath(url, target_dir), 'wb')
          request.file.write chunk
        end

        request.callback do
          close_download request
          puts "Image [#{url}] was downloaded successfully."
        end

        request.errback do
          close_download request
          warn "Image [#{url}] could not be downloaded: #{request.error}"
        end

        request_pool.add request.object_id, request
      end
    end
  end

  # Builds a path inside +target_dir+ for the file named by the last path
  # segment of +url+. If that path already exists, "-2", "-3", ... is
  # inserted before the extension until an unused path is found.
  #
  # Returns the path as a String.
  def unique_filepath(url, target_dir)
    filename = File.basename URI(url).path.to_s
    # A URL with an empty or root path has no usable file name.
    filename = 'image' if filename.empty? || filename == '/'

    ext = File.extname filename
    stem = File.basename filename, ext
    # File.join copes with target directories with or without a trailing slash.
    filepath = File.join target_dir, filename
    suffix = 1

    while File.exist? filepath
      suffix += 1
      filepath = File.join target_dir, "#{stem}-#{suffix}#{ext}"
    end

    filepath
  end

  private

  # True when the path component of +href+ ends with a known image
  # extension (case-insensitive). Unparseable hrefs count as non-images.
  def image_link?(href)
    path = URI(href).path
    !path.nil? && path.downcase.end_with?(*@image_extensions)
  rescue URI::Error
    false
  end

  # Resolves +ref+ against the page address +base+ and returns the absolute
  # URL as a String, or nil when +ref+ is not a usable URI reference.
  def absolute_url(base, ref)
    URI(base).merge(ref).to_s
  rescue URI::Error
    nil
  end

  # Closes the output file attached to +request+, if one was opened, so
  # buffered data is flushed to disk and the descriptor is released.
  def close_download(request)
    request.file.close if request.file && !request.file.closed?
  end
end
metadata ADDED
@@ -0,0 +1,114 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: idl
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Ilya Vassilevsky
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-12-09 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: eventmachine
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: em-http-request
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: nokogiri
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: docopt
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ description: Provides a class that can download images from webpages and a command-line
79
+ tool
80
+ email: vassilevsky@gmail.com
81
+ executables:
82
+ - idl
83
+ extensions: []
84
+ extra_rdoc_files: []
85
+ files:
86
+ - lib/idl.rb
87
+ - bin/idl
88
+ homepage: https://github.com/vassilevsky/idl
89
+ licenses: []
90
+ post_install_message: Now you can run `idl http://www.example.com/path/to/page.html`
91
+ to download all images from the page to the current directory.
92
+ rdoc_options: []
93
+ require_paths:
94
+ - lib
95
+ - lib
96
+ required_ruby_version: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '1.9'
102
+ required_rubygems_version: !ruby/object:Gem::Requirement
103
+ none: false
104
+ requirements:
105
+ - - ! '>='
106
+ - !ruby/object:Gem::Version
107
+ version: '0'
108
+ requirements: []
109
+ rubyforge_project:
110
+ rubygems_version: 1.8.24
111
+ signing_key:
112
+ specification_version: 3
113
+ summary: Downloads all images from a webpage in parallel
114
+ test_files: []