idl 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/idl +19 -0
- data/lib/idl.rb +79 -0
- metadata +114 -0
data/bin/idl
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Command-line entry point: parses the arguments with docopt and hands the
# page URL, the --linked flag and the target directory to IDl#harvest.
require 'docopt'
require 'idl'

# docopt rules that matter for this text:
# * a positional argument must be written as <name> (or UPPER-CASE); a bare
#   lowercase word such as `directory` is parsed as a literal command whose
#   value is true/false, not the string the user typed;
# * flags and their description must be separated by two or more spaces.
doc = <<DOC
IDl - Download all images from a webpage at given URL

Usage:
  idl [options] <URL> [<directory>]

Options:
  -h --help    Print this help message and exit
  -l --linked  Download linked-to images instead of included ones
DOC

begin
  options = Docopt::docopt doc
rescue Docopt::Exit => e
  # Raised for --help and for invalid invocations; the message already holds
  # the text to show, so print it instead of letting a stack trace escape.
  puts e.message
  exit
end

if options['<URL>']
  IDl.new.harvest options['<URL>'], options['--linked'], options['<directory>'] || './'
end
|
data/lib/idl.rb
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'eventmachine'
|
4
|
+
require 'em-http-request'
|
5
|
+
|
6
|
+
# Reopens EventMachine::HttpClient to add a read/write `file` attribute.
# IDl#fetch uses it to keep the output File of each in-flight request on the
# request object itself, so the stream handler can open the file on the first
# chunk and reuse it for later chunks.
module EventMachine
|
7
|
+
class HttpClient
|
8
|
+
attr_accessor :file
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
require 'fileutils'

# Downloads the images of a web page in parallel.
#
# #harvest collects image URLs from a page (either the <img> sources or the
# links that point at image files), #fetch downloads a list of URLs with
# EventMachine / em-http-request, and #unique_filepath picks a file name that
# does not clash with a file already present in the target directory.
class IDl
  def initialize
    # Extensions (with leading dot) that mark a link as pointing at an image.
    @image_extensions = %w(jpg jpeg png gif tif tiff).map { |ext| ".#{ext}" }
  end

  # Downloads the images referenced by the page at +url+.
  #
  # url        - absolute URL of the page (a scheme open-uri can read).
  # linked     - when true, download the targets of <a href> links whose path
  #              ends with a known image extension; otherwise download the
  #              <img src> images.
  # target_dir - directory the files are written to.
  #
  # Raises ArgumentError when +url+ cannot be opened through open-uri.
  # Returns nil when the page references no usable image.
  def harvest(url, linked=false, target_dir='./')
    page_uri = URI.parse url
    # Only URI classes extended by open-uri (http, https, ftp) have a public
    # #open. Going through it instead of Kernel#open keeps working on
    # Ruby >= 3.0 and never treats the argument as a "| command" pipe.
    unless page_uri.respond_to? :open
      raise ArgumentError, "Cannot open [#{url}]: not a URL supported by open-uri"
    end

    # Block form closes the response body once the document is parsed.
    doc = page_uri.open { |page| Nokogiri.parse page }

    refs = if linked
      doc.css('a[href]').map { |a| a['href'] }.select { |href| image_link? href }
    else
      doc.css('img[src]').map { |img| img['src'] }
    end

    urls = refs.map { |ref| absolute_url url, ref }.compact

    # An empty Array is truthy in Ruby, so test for emptiness explicitly.
    fetch urls, target_dir unless urls.empty?
  end

  # Downloads every distinct URL in +urls+ into +target_dir+ concurrently and
  # returns once all requests have succeeded or failed.
  #
  # Returns nil immediately when +urls+ is empty.
  def fetch(urls, target_dir='./')
    pending = urls.uniq
    # The MultiRequest callback is only triggered by a finishing request, so
    # with nothing enqueued the reactor below would never be stopped.
    return if pending.empty?

    # mkdir_p is a no-op for an existing directory; do it once up front
    # rather than on every received chunk.
    FileUtils.mkdir_p target_dir

    EM.run do
      request_pool = EM::MultiRequest.new

      request_pool.callback do
        puts 'All requests finished.'
        EM.stop
      end

      pending.each do |url|
        puts "Enqueuing [#{url}]"
        request = EM::HttpRequest.new(url).get

        request.stream do |chunk|
          # Open lazily so that requests which never deliver data leave no
          # empty file behind.
          request.file ||= File.open(unique_filepath(url, target_dir), 'wb')
          request.file.write chunk
        end

        request.callback do
          close_download request
          puts "Image [#{url}] was downloaded successfully."
        end

        request.errback do
          close_download request
          puts "Image [#{url}] could not be downloaded: #{request.error}"
        end

        # Registered after the handlers above so the file is closed before
        # the pool's completion callback stops the reactor.
        request_pool.add request.object_id, request
      end
    end
  end

  # Returns a path inside +target_dir+ for the file named by the last path
  # segment of +url+. If that path already exists, "-2", "-3", ... is
  # inserted before the extension until an unused name is found.
  def unique_filepath(url, target_dir)
    filename = File.basename URI(url).path
    # A URL without a file segment ("http://host" or "http://host/") would
    # otherwise yield "" or "/" as the name.
    filename = 'image' if filename.empty? || filename == '/'

    ext = File.extname filename
    stem = File.basename filename, ext
    # File.join works whether or not target_dir ends with a separator.
    filepath = File.join target_dir, filename
    suffix = 1

    while File.exist? filepath
      suffix += 1
      filepath = File.join target_dir, "#{stem}-#{suffix}#{ext}"
    end

    filepath
  end

  private

  # True when +href+ parses as a URI whose path ends with one of the known
  # image extensions (case-insensitive). Unparsable or path-less references
  # (e.g. mailto: links) are not image links.
  def image_link?(href)
    path = URI(href).path
    !path.nil? && path.downcase.end_with?(*@image_extensions)
  rescue URI::InvalidURIError
    false
  end

  # Resolves +ref+ against the page URL +base+. Returns the absolute URL as
  # a String, or nil (after a warning) when +ref+ cannot be resolved, so one
  # malformed attribute does not abort the whole harvest.
  def absolute_url(base, ref)
    URI(base).merge(ref).to_s
  rescue URI::Error
    warn "Skipping unusable image reference [#{ref}]"
    nil
  end

  # Closes the output file of +request+ if one was opened and is still open.
  def close_download(request)
    request.file.close if request.file && !request.file.closed?
  end
end
|
metadata
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: idl
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Ilya Vassilevsky
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-12-09 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: eventmachine
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: em-http-request
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: nokogiri
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: docopt
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :runtime
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
description: Provides a class that can download images from webpages and a command-line
|
79
|
+
tool
|
80
|
+
email: vassilevsky@gmail.com
|
81
|
+
executables:
|
82
|
+
- idl
|
83
|
+
extensions: []
|
84
|
+
extra_rdoc_files: []
|
85
|
+
files:
|
86
|
+
- lib/idl.rb
|
87
|
+
- bin/idl
|
88
|
+
homepage: https://github.com/vassilevsky/idl
|
89
|
+
licenses: []
|
90
|
+
post_install_message: Now you can run `idl http://www.example.com/path/to/page.html`
|
91
|
+
to download all images from the page to the ./page/ directory.
|
92
|
+
rdoc_options: []
|
93
|
+
require_paths:
|
94
|
+
- lib
|
95
|
+
- lib
|
96
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '1.9'
|
102
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
103
|
+
none: false
|
104
|
+
requirements:
|
105
|
+
- - ! '>='
|
106
|
+
- !ruby/object:Gem::Version
|
107
|
+
version: '0'
|
108
|
+
requirements: []
|
109
|
+
rubyforge_project:
|
110
|
+
rubygems_version: 1.8.24
|
111
|
+
signing_key:
|
112
|
+
specification_version: 3
|
113
|
+
summary: Downloads all images from a webpage in parallel
|
114
|
+
test_files: []
|