seep 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.rspec +1 -0
- data/Gemfile +17 -0
- data/Gemfile.lock +63 -0
- data/LICENSE.txt +20 -0
- data/README +1 -0
- data/Rakefile +50 -0
- data/doc/small.jpg +0 -0
- data/doc/test_a.html +1785 -0
- data/doc/test_b.html +730 -0
- data/lib/seep.rb +19 -0
- data/lib/seep/doc.rb +44 -0
- data/lib/seep/fetcher.rb +107 -0
- data/lib/seep/image.rb +18 -0
- data/seep.gemspec +85 -0
- data/spec/a_spec.rb +7 -0
- data/spec/doc_spec.rb +42 -0
- data/spec/fetcher_spec.rb +66 -0
- data/spec/image_spec.rb +35 -0
- data/spec/spec_helper.rb +13 -0
- data/spider.rb +226 -0
- data/spidr_test.rb +11 -0
- metadata +172 -0
data/spider.rb
ADDED
@@ -0,0 +1,226 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'pp'
|
6
|
+
require 'singleton'
|
7
|
+
require 'net/http'
|
8
|
+
|
9
|
+
# Fetches the body of link.uri over HTTP with a browser-like User-Agent,
# sending link.prev_uri as the Referer when present.
#
# Returns the response body String on 2xx, or nil on any error.  On a 3xx
# with a Location header, the redirect target is pushed onto the global
# SpiderConfig queue (as a new SpiderLink at the same depth) and nil is
# returned — the caller does not follow redirects inline.
def grab_data_from_uri(link)
  headers = {
    'User-Agent' => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.121 Safari/535.2'
  }
  headers['Referer'] = link.prev_uri.to_s unless link.prev_uri.nil?

  req_path = link.uri.path
  req_path += "?#{link.uri.query}" unless link.uri.query.nil?
  req_path = "/" if req_path.nil? || req_path.empty?

  begin
    # BUG FIX: honor the URI's port and scheme.  The original called
    # Net::HTTP.start(host) with no port, so every https:// or
    # non-default-port link silently hit port 80 over plain HTTP.
    res = Net::HTTP.start(link.uri.host, link.uri.port,
                          use_ssl: link.uri.scheme == 'https') do |http|
      http.get(req_path, headers)
    end
  rescue Errno::ETIMEDOUT
    p "timeout error"
    return nil
  rescue Errno::ECONNRESET
    p "connection reset"
    return nil
  rescue EOFError
    p "end of file error"
    return nil
  rescue SystemExit
    exit
  rescue => e
    # Catch-all for Net::HTTPExceptions and anything unexpected.  The
    # original also rescued NoMethodError separately ("weird error"), which
    # masks real bugs — such failures are now reported with their class so
    # they can be diagnosed instead of vanishing.
    p "some other error"
    pp e
    return nil
  end

  return res.body if res.is_a?(Net::HTTPSuccess)

  if res.is_a?(Net::HTTPRedirection)
    p "Redirection detected."
    location = res.header['location']
    unless location.nil?
      begin
        new_link = SpiderLink.new(URI::join(link.uri.to_s, location), link.uri, link.depth)
      rescue
        return nil
      end
      p "Redirected to: #{new_link.uri.to_s}"
      SpiderConfig.instance.queued_links << new_link
    end
  end
  nil
end
|
60
|
+
|
61
|
+
# Convenience alias kept for readability at call sites: fetching a page is
# exactly grab_data_from_uri on the given SpiderLink.
def open_link(link)
  grab_data_from_uri(link)
end
|
64
|
+
|
65
|
+
#url = "http://galleries.nextdoor-models.com/static/clean/149.html?ccbill_id=1757428&site_link=http://www.nextdoor-models.com/"
|
66
|
+
#p open_link(url)
|
67
|
+
|
68
|
+
# Parses +html+ and resolves every <a href> against +uri+.
#
# Returns two deduplicated arrays: [links, images], where URIs ending in
# .jpg/.jpeg (case-insensitive) are classified as images and everything
# else as links.  Returns two empty arrays when +html+ is nil.
def get_links(uri, html)
  return [], [] if html.nil?
  doc = Nokogiri::HTML.parse(html)
  images = []
  links = []
  doc.search("a").each do |anchor|
    href = anchor.get_attribute("href")
    # BUG FIX: skip anchors with no href.  The original only assigned
    # new_uri when href was non-nil, then fell through and pushed nil onto
    # the links list (nil.to_s never matches the image regex).
    next if href.nil?
    begin
      new_uri = URI::join(uri.to_s, href)
    rescue
      p "Bad URI. Skipping. #{href}"
      next
    end
    if new_uri.to_s =~ /jpe?g$/i
      images << new_uri
    else
      links << new_uri
    end
  end
  return links.uniq, images.uniq
end
|
93
|
+
|
94
|
+
#url = "http://galleries.nextdoor-models.com/static/clean/149.html?ccbill_id=1757428&site_link=http://www.nextdoor-models.com/"
|
95
|
+
#html = open_link(url)
|
96
|
+
#links, images = get_links(url,html)
|
97
|
+
#p "links: "
|
98
|
+
#pp links
|
99
|
+
#p "images: "
|
100
|
+
#pp images
|
101
|
+
|
102
|
+
$base = "/mnt/media/stuff/porn/"

# Downloads the image at +uri+ (with +parent+ as the Referer page) and
# writes it to $base/<host>/<name>.jpg, creating the directory as needed.
# Does nothing when the download fails.
def save_image(uri, name, parent)
  dir = $base + uri.host
  path = "#{dir}/#{name}.jpg"
  FileUtils.mkdir_p(dir)
  res = grab_data_from_uri(SpiderLink.new(uri, parent, 0))
  # BUG FIX: grab_data_from_uri returns nil on any fetch error; the
  # original passed that nil straight to write and raised.
  return if res.nil?
  p "Saving #{uri.to_s} to #{path}"
  # BUG FIX: binary mode for JPEG bytes, and the block form closes the
  # handle — the original `File.open(path,"w").write(res)` leaked one open
  # file descriptor per image saved.
  File.open(path, "wb") { |f| f.write(res) }
end
|
111
|
+
|
112
|
+
# Builds a pseudo-random alphanumeric string of the given length
# (default 10), drawing uniformly from [a-zA-Z0-9].  Not
# cryptographically secure — used only to make batch names unique.
def random_string(length = 10)
  alphabet = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
  Array.new(length) { alphabet[rand(alphabet.size)] }.join
end
|
118
|
+
|
119
|
+
#url = "http://galleries.nextdoor-models.com/content/149/thm/01.jpg"
|
120
|
+
#uri = URI::parse(url)
|
121
|
+
#save_image(uri,1001)
|
122
|
+
|
123
|
+
#url = "http://galleries.nextdoor-models.com/static/clean/149.html?ccbill_id=1757428&site_link=http://www.nextdoor-models.com/"
|
124
|
+
#html = open_link(url)
|
125
|
+
#links, images = get_links(url,html)
|
126
|
+
#batch_id = 1
|
127
|
+
#image_count = 0
|
128
|
+
#images.each do |img|
|
129
|
+
# image_count += 1
|
130
|
+
# save_image(img,"%08s.%08d.jpg" % [ batch_id, image_count ] )
|
131
|
+
#end
|
132
|
+
|
133
|
+
# Processes one crawl frontier entry: fetches the page, saves every image
# found on it under a timestamped batch name, and returns the page's
# non-image links for further crawling.
def proc_url(link)
  html = open_link(link)
  page_links, page_images = get_links(link.uri, html)
  batch = Time.now.strftime("%Y%m%d.%H%M.%S.") + random_string(4)
  page_images.each_with_index do |img, idx|
    save_image(img, "#{batch}.%04d" % (idx + 1), link.uri)
  end
  page_links
end
|
145
|
+
|
146
|
+
# One crawl frontier entry: the target URI, the page it was discovered on
# (sent as the Referer header), and its crawl depth.  Two links are equal
# when their URIs render to the same string, regardless of depth or origin.
class SpiderLink
  attr_accessor :uri, :prev_uri, :depth

  def initialize(uri, prev_uri, depth)
    @uri = uri
    @prev_uri = prev_uri
    @depth = depth
  end

  # BUG FIX: the original computed the debug-trace flag from `other.to_s`
  # (the object's default string form) but returned a comparison against
  # `other.uri.to_s`, so the trace and the actual result could disagree.
  # Both now use the same URI-string comparison, computed once.
  def ==(other)
    ret = @uri.to_s == other.uri.to_s
    p "Compare #{@uri.to_s} to #{other.uri.to_s}: #{ret}" if ret
    ret
  end
end
|
159
|
+
|
160
|
+
# Process-wide crawl configuration and shared state, accessed everywhere
# via SpiderConfig.instance (Singleton).
class SpiderConfig
  include Singleton

  attr_accessor :max_depth, :root_hosts, :queued_links, :seen_links, :max_queue_size

  def initialize
    @max_depth      = 5        # hops allowed away from a root host
    @max_queue_size = 100000   # hard cap on the pending-link queue
    @root_hosts     = []       # hosts that do not increase crawl depth
    @queued_links   = SpiderQueue.new
    @seen_links     = []       # URIs already enqueued at least once
  end
end
|
171
|
+
|
172
|
+
# FIFO queue of links, de-duplicated by URI (this Hash maps uri => link).
# Re-adding a URI that is already queued bumps it to the back of the queue
# but keeps the originally stored link object.
class SpiderQueue < Hash
  def initialize
    super
    @ordered_queue = []   # URIs in dequeue order
  end

  # Enqueue +link+ (anything responding to #uri).
  def <<(link)
    key = link.uri
    if has_key?(key)
      @ordered_queue.delete(key)
    else
      self[key] = link
    end
    @ordered_queue << key
  end

  # Dequeue and return the oldest link, or nil when the queue is empty.
  def shift
    delete(@ordered_queue.shift)
  end
end
|
191
|
+
|
192
|
+
# Crawl driver: seeds the shared queue with +start_url+, marks its host as
# a root host (links staying on a root host do not increase depth), then
# processes queued links until the queue drains.  Runs forever on a large
# site; bounded only by max_depth and max_queue_size.
def link_loop(start_url)
  config = SpiderConfig.instance

  start_uri = URI::parse(start_url)
  config.queued_links << SpiderLink.new(start_uri, nil, 0)
  p config.queued_links
  config.root_hosts << start_uri.host

  loop do
    current = config.queued_links.shift
    if current.nil?
      p "Queue is empty. Quitting"
      break
    end
    # NOTE(review): subtracting (seen - queued.keys) — rather than
    # subtracting both lists — re-admits already-seen URIs that are still
    # queued.  Looks suspicious but is preserved as-is from the original;
    # confirm intent before changing.
    fresh = proc_url(current) - (config.seen_links - config.queued_links.keys)
    p "Q:#{config.queued_links.size} Seen:#{config.seen_links.size} #{current.uri.to_s} - #{fresh.size}"
    fresh.each do |found|
      next if found.nil?
      # Same-root-host links inherit the current depth; off-host links go
      # one level deeper.
      depth = config.root_hosts.include?(found.host) ? current.depth : current.depth + 1
      next if depth > config.max_depth || config.queued_links.size > config.max_queue_size
      config.queued_links << SpiderLink.new(found, current.uri, depth)
      config.seen_links << found
    end
  end
end
|
226
|
+
|
data/spidr_test.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'rubygems'
require 'spidr'

p "Spider test!"

# Smoke test of the spidr gem: crawl one gallery page, printing every URL
# visited, every link edge discovered, and every failed fetch, throttled
# with a 0.2s pause per event.
Spidr.site('http://www.definebabe.com/gallery/kme/jana-cova/') do |agent|
  agent.every_url do |url|
    puts "url: #{url}"
    sleep 0.2
  end
  agent.every_link do |origin, link|
    puts "link: #{link} (from #{origin})"
    sleep 0.2
  end
  agent.every_failed_url do |url|
    puts "failed: #{url}"
    sleep 0.2
  end
end
|
11
|
+
|
metadata
ADDED
@@ -0,0 +1,172 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: seep
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Carl Zulauf
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-12-27 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: redis-native_hash
|
16
|
+
requirement: &12752080 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *12752080
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: gd2-ffij
|
27
|
+
requirement: &12751600 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *12751600
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: curb
|
38
|
+
requirement: &12751060 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
type: :runtime
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *12751060
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: nokogiri
|
49
|
+
requirement: &12750520 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
type: :runtime
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *12750520
|
58
|
+
- !ruby/object:Gem::Dependency
|
59
|
+
name: ruby-debug19
|
60
|
+
requirement: &12750040 !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ! '>='
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
66
|
+
type: :development
|
67
|
+
prerelease: false
|
68
|
+
version_requirements: *12750040
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rspec
|
71
|
+
requirement: &12749500 !ruby/object:Gem::Requirement
|
72
|
+
none: false
|
73
|
+
requirements:
|
74
|
+
- - ~>
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: 2.3.0
|
77
|
+
type: :development
|
78
|
+
prerelease: false
|
79
|
+
version_requirements: *12749500
|
80
|
+
- !ruby/object:Gem::Dependency
|
81
|
+
name: bundler
|
82
|
+
requirement: &12749000 !ruby/object:Gem::Requirement
|
83
|
+
none: false
|
84
|
+
requirements:
|
85
|
+
- - ~>
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: 1.0.0
|
88
|
+
type: :development
|
89
|
+
prerelease: false
|
90
|
+
version_requirements: *12749000
|
91
|
+
- !ruby/object:Gem::Dependency
|
92
|
+
name: jeweler
|
93
|
+
requirement: &12611560 !ruby/object:Gem::Requirement
|
94
|
+
none: false
|
95
|
+
requirements:
|
96
|
+
- - ~>
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
version: 1.6.4
|
99
|
+
type: :development
|
100
|
+
prerelease: false
|
101
|
+
version_requirements: *12611560
|
102
|
+
- !ruby/object:Gem::Dependency
|
103
|
+
name: rcov
|
104
|
+
requirement: &12610980 !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
110
|
+
type: :development
|
111
|
+
prerelease: false
|
112
|
+
version_requirements: *12610980
|
113
|
+
description: Collection of web spidering and downloading tools using redis, curl,
|
114
|
+
and gd.
|
115
|
+
email: carl@linkleaf.com
|
116
|
+
executables: []
|
117
|
+
extensions: []
|
118
|
+
extra_rdoc_files:
|
119
|
+
- LICENSE.txt
|
120
|
+
- README
|
121
|
+
files:
|
122
|
+
- .document
|
123
|
+
- .rspec
|
124
|
+
- Gemfile
|
125
|
+
- Gemfile.lock
|
126
|
+
- LICENSE.txt
|
127
|
+
- README
|
128
|
+
- Rakefile
|
129
|
+
- doc/small.jpg
|
130
|
+
- doc/test_a.html
|
131
|
+
- doc/test_b.html
|
132
|
+
- lib/seep.rb
|
133
|
+
- lib/seep/doc.rb
|
134
|
+
- lib/seep/fetcher.rb
|
135
|
+
- lib/seep/image.rb
|
136
|
+
- seep.gemspec
|
137
|
+
- spec/a_spec.rb
|
138
|
+
- spec/doc_spec.rb
|
139
|
+
- spec/fetcher_spec.rb
|
140
|
+
- spec/image_spec.rb
|
141
|
+
- spec/spec_helper.rb
|
142
|
+
- spider.rb
|
143
|
+
- spidr_test.rb
|
144
|
+
homepage: http://github.com/carlzulauf/seep
|
145
|
+
licenses:
|
146
|
+
- MIT
|
147
|
+
post_install_message:
|
148
|
+
rdoc_options: []
|
149
|
+
require_paths:
|
150
|
+
- lib
|
151
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
152
|
+
none: false
|
153
|
+
requirements:
|
154
|
+
- - ! '>='
|
155
|
+
- !ruby/object:Gem::Version
|
156
|
+
version: '0'
|
157
|
+
segments:
|
158
|
+
- 0
|
159
|
+
hash: 1561364847905091588
|
160
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
161
|
+
none: false
|
162
|
+
requirements:
|
163
|
+
- - ! '>='
|
164
|
+
- !ruby/object:Gem::Version
|
165
|
+
version: '0'
|
166
|
+
requirements: []
|
167
|
+
rubyforge_project:
|
168
|
+
rubygems_version: 1.8.10
|
169
|
+
signing_key:
|
170
|
+
specification_version: 3
|
171
|
+
summary: web spidering/downloading tools
|
172
|
+
test_files: []
|