idl 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3):
  1. data/bin/idl +19 -0
  2. data/lib/idl.rb +79 -0
  3. metadata +114 -0
data/bin/idl ADDED
@@ -0,0 +1,19 @@
1
#!/usr/bin/env ruby

# Command-line entry point: parses the arguments with docopt and hands them
# to IDl#harvest.

require 'docopt'
require 'idl'

# docopt derives the accepted arguments from this text. A positional argument
# must be written as <name>; a bare word such as `directory` is parsed as a
# literal command, so the target directory is declared as [<directory>].
usage = <<USAGE
IDl - Download all images from a webpage at given URL

Usage:
  idl [options] <URL> [<directory>]

Options:
  -h --help    Print this help message and exit
  -l --linked  Download linked-to images instead of included ones
USAGE

begin
  options = Docopt.docopt(usage)
rescue Docopt::Exit => e
  # docopt raises for both --help and invalid usage; print its message
  # instead of letting the exception surface as a backtrace.
  puts e.message
  exit
end

if options['<URL>']
  IDl.new.harvest options['<URL>'], options['--linked'], options['<directory>'] || './'
end
data/lib/idl.rb ADDED
@@ -0,0 +1,79 @@
1
+ require 'open-uri'
2
+ require 'nokogiri'
3
+ require 'eventmachine'
4
+ require 'em-http-request'
5
+
6
# Reopens the HTTP client class so that each request object exposes a
# readable and writable `file` attribute.
module EventMachine
  class HttpClient
    attr_reader :file
    attr_writer :file
  end
end
11
+
12
# FileUtils.mkdir_p is used below to create the target directory.
require 'fileutils'

# Collects the image URLs referenced by a web page and downloads them in
# parallel through EventMachine.
class IDl
  def initialize
    @image_extensions = %w(jpg jpeg png gif tif tiff).map { |ext| ".#{ext}" }
  end

  # Fetches the page at +url+, gathers its image URLs and downloads them.
  #
  # url        - address of the page to scan (must be a URL open-uri can fetch).
  # linked     - when truthy, take images that <a href> elements link to
  #              instead of the ones embedded with <img src>.
  # target_dir - directory the images are written to.
  #
  # Raises ArgumentError when +url+ is not a fetchable URL.
  def harvest(url, linked=false, target_dir='./')
    base = URI(url)
    # URI#open comes from open-uri and only performs a network fetch.
    # Kernel#open would treat a string starting with "|" as a command and
    # no longer accepts URLs on Ruby 3.
    raise ArgumentError, "Cannot fetch [#{url}]: not a supported URL" unless base.respond_to?(:open)

    doc = Nokogiri.parse base.open

    refs = if linked
      doc.css('a[href]').map { |a| a['href'] }.select { |href| image_link?(href) }
    else
      doc.css('img[src]').map { |img| img['src'] }
    end

    # References that cannot be resolved against the page URL are dropped.
    urls = refs.map { |ref| absolute_url(base, ref) }.compact

    fetch urls, target_dir unless urls.empty?
  end

  # Downloads every URL in +urls+ (duplicates removed) into +target_dir+,
  # running all requests concurrently and returning once all have finished.
  def fetch(urls, target_dir='./')
    # With no requests added, the MultiRequest callback never fires and the
    # reactor would never be stopped.
    return if urls.nil? || urls.empty?

    EM.run do
      request_pool = EM::MultiRequest.new

      request_pool.callback do
        puts 'All requests finished.'
        EM.stop
      end

      urls.uniq.each do |url|
        puts "Enqueuing [#{url}]"
        request = EM::HttpRequest.new(url).get

        request.stream do |chunk|
          # The file is created lazily on the first chunk so that requests
          # which deliver no data leave nothing behind.
          request.file ||= open_download(url, target_dir)
          request.file.write chunk
        end

        request.callback do
          close_download request
          puts "Image [#{url}] was downloaded successfully."
        end

        request.errback do
          close_download request
          warn "Image [#{url}] could not be downloaded: #{request.error}"
        end

        request_pool.add request.object_id, request
      end
    end
  end

  # Returns a path inside +target_dir+ named after the last segment of the
  # URL path. If that path is taken, "-2", "-3", ... is inserted before the
  # extension until an unused name is found.
  def unique_filepath(url, target_dir)
    filename = File.basename URI(url).path
    ext = File.extname filename
    stem = File.basename filename, ext
    # File.join copes with target_dir given with or without a trailing slash.
    filepath = File.join target_dir, filename
    suffix = 1

    while File.exist? filepath
      suffix += 1
      filepath = File.join target_dir, "#{stem}-#{suffix}#{ext}"
    end

    filepath
  end

  private

  # True when the path component of +href+ ends in a known image extension.
  # Malformed references count as "not an image" rather than raising.
  def image_link?(href)
    path = URI(href).path
    !path.nil? && path.downcase.end_with?(*@image_extensions)
  rescue URI::Error
    false
  end

  # Resolves +ref+ against the page URI +base+; nil when it cannot be resolved.
  def absolute_url(base, ref)
    base.merge(ref).to_s
  rescue URI::Error
    nil
  end

  # Ensures +target_dir+ exists and opens a fresh binary file for +url+ in it.
  def open_download(url, target_dir)
    FileUtils.mkdir_p target_dir
    File.open unique_filepath(url, target_dir), 'wb'
  end

  # Closes the file attached to +request+, if one was opened, so buffered
  # data is flushed and the descriptor is released.
  def close_download(request)
    request.file.close if request.file && !request.file.closed?
  end
end
metadata ADDED
@@ -0,0 +1,114 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: idl
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Ilya Vassilevsky
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-12-09 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: eventmachine
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: em-http-request
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: nokogiri
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: docopt
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ description: Provides a class that can download images from webpages and a command-line
79
+ tool
80
+ email: vassilevsky@gmail.com
81
+ executables:
82
+ - idl
83
+ extensions: []
84
+ extra_rdoc_files: []
85
+ files:
86
+ - lib/idl.rb
87
+ - bin/idl
88
+ homepage: https://github.com/vassilevsky/idl
89
+ licenses: []
90
+ post_install_message: Now you can run `idl http://www.example.com/path/to/page.html`
91
+ to download all images from the page to the ./page/ directory.
92
+ rdoc_options: []
93
+ require_paths:
94
+ - lib
95
+ - lib
96
+ required_ruby_version: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '1.9'
102
+ required_rubygems_version: !ruby/object:Gem::Requirement
103
+ none: false
104
+ requirements:
105
+ - - ! '>='
106
+ - !ruby/object:Gem::Version
107
+ version: '0'
108
+ requirements: []
109
+ rubyforge_project:
110
+ rubygems_version: 1.8.24
111
+ signing_key:
112
+ specification_version: 3
113
+ summary: Downloads all images from a webpage in parallel
114
+ test_files: []