seep 0.0.2

spider.rb ADDED
@@ -0,0 +1,226 @@
+ require 'rubygems'
+ require 'open-uri'
+ require 'nokogiri'
+ require 'fileutils'
+ require 'pp'
+ require 'singleton'
+ require 'net/http'
+
+ def grab_data_from_uri(link)
+   #uri = link.uri
+   headers = {
+     'User-Agent' => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.121 Safari/535.2'
+   }
+   headers['Referer'] = link.prev_uri.to_s if not link.prev_uri.nil?
+   req_path = link.uri.path
+   req_path = req_path + "?#{link.uri.query}" if not link.uri.query.nil?
+   req_path = "/" if req_path == "" or req_path.nil?
+   begin
+     res = Net::HTTP.start(link.uri.host, link.uri.port) { |http| # include the port so non-80 URLs work
+       http.get(req_path, headers)
+     }
+   rescue Net::HTTPExceptions => e
+     p "HTTP exception"
+     pp e
+     return nil
+   rescue Errno::ETIMEDOUT
+     p "timeout error"
+     return nil
+   rescue Errno::ECONNRESET
+     p "connection reset"
+     return nil
+   rescue EOFError
+     p "end of file error"
+     return nil
+   rescue NoMethodError
+     p "weird error"
+     return nil
+   rescue SystemExit
+     exit
+   rescue
+     p "some other error"
+     return nil
+   end
+   #pp res
+   return res.body if (res.is_a? Net::HTTPSuccess)
+   if (res.is_a? Net::HTTPRedirection)
+     p "Redirection detected."
+     if ( not res.header['location'].nil? )
+       begin
+         new_link = SpiderLink.new(URI::join(link.uri.to_s,res.header['location']),link.uri,link.depth)
+       rescue
+         return nil
+       end
+       p "Redirected to: #{new_link.uri.to_s}"
+       SpiderConfig.instance.queued_links << new_link
+     end
+   end
+   return nil
+ end
+
+ def open_link(link)
+   grab_data_from_uri(link)
+ end
+
+ #url = "http://galleries.nextdoor-models.com/static/clean/149.html?ccbill_id=1757428&site_link=http://www.nextdoor-models.com/"
+ #p open_link(url)
+
+ def get_links(uri,html)
+   return [],[] if html.nil?
+   #begin
+   doc = Nokogiri::HTML.parse(html)
+   #rescue NoMethodError #capture parsing errors
+   #  return [],[]
+   #end
+   images = []; links = []
+   doc.search("a").each do |e|
+     link = e.get_attribute("href")
+     #p link
+     begin
+       new_uri = URI::join(uri.to_s,link) if not link.nil?
+     rescue
+       p "Bad URI. Skipping. #{link}"
+       next
+     end
+     if (new_uri.to_s =~ /jpe?g$/i) then
+       images << new_uri
+     else
+       links << new_uri
+     end
+   end
+   return links.uniq, images.uniq
+ end
+
+ #url = "http://galleries.nextdoor-models.com/static/clean/149.html?ccbill_id=1757428&site_link=http://www.nextdoor-models.com/"
+ #html = open_link(url)
+ #links, images = get_links(url,html)
+ #p "links: "
+ #pp links
+ #p "images: "
+ #pp images
+
+ $base = "/mnt/media/stuff/porn/"
+ def save_image(uri,name,parent)
+   dir = $base + uri.host
+   path = "#{dir}/#{name}.jpg"
+   FileUtils.mkdir_p(dir)
+   res = grab_data_from_uri(SpiderLink.new(uri,parent,0))
+   p "Saving #{uri.to_s} to #{path}"
+   File.open(path,"wb") { |f| f.write(res) } unless res.nil? # binary mode, close the handle, skip failed fetches
+ end
+
+ def random_string(length=10)
+   chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
+   password = ''
+   length.times { password << chars[rand(chars.size)] }
+   password
+ end
+
+ #url = "http://galleries.nextdoor-models.com/content/149/thm/01.jpg"
+ #uri = URI::parse(url)
+ #save_image(uri,1001)
+
+ #url = "http://galleries.nextdoor-models.com/static/clean/149.html?ccbill_id=1757428&site_link=http://www.nextdoor-models.com/"
+ #html = open_link(url)
+ #links, images = get_links(url,html)
+ #batch_id = 1
+ #image_count = 0
+ #images.each do |img|
+ #  image_count += 1
+ #  save_image(img,"%08s.%08d.jpg" % [ batch_id, image_count ] )
+ #end
+
+ def proc_url(link)
+   html = open_link(link)
+   #p html
+   links, images = get_links(link.uri,html)
+   batch = Time.now.strftime("%Y%m%d.%H%M.%S.") + random_string(4)
+   image_count = 0
+   images.each do |img|
+     image_count += 1
+     save_image(img,"#{batch}.%04d" % image_count, link.uri )
+   end
+   links
+ end
+
+ class SpiderLink
+   attr_accessor :uri, :prev_uri, :depth
+   def initialize(uri, prev_uri, depth)
+     @uri = uri
+     @prev_uri = prev_uri
+     @depth = depth
+   end
+   def ==(other)
+     ret = @uri.to_s == other.uri.to_s # links with the same URI count as equal
+     p "Compare #{@uri.to_s} to #{other.uri.to_s}: #{ret}" if ret
+     ret
+   end
+ end
+
+ class SpiderConfig
+   include Singleton
+   attr_accessor :max_depth, :root_hosts, :queued_links, :seen_links, :max_queue_size
+   def initialize
+     @max_depth = 5
+     @max_queue_size = 100000
+     @root_hosts = []
+     @queued_links = SpiderQueue.new
+     @seen_links = []
+   end
+ end
+
+ class SpiderQueue < Hash
+   def initialize
+     super
+     @ordered_queue = []
+   end
+
+   def <<(link)
+     if self.has_key?(link.uri)
+       @ordered_queue.delete(link.uri) # a re-queued URI moves to the back of the line
+     else
+       self[link.uri] = link
+     end
+     @ordered_queue << link.uri
+   end
+
+   def shift
+     self.delete(@ordered_queue.shift) # returns the SpiderLink, or nil when empty
+   end
+ end
+
+ def link_loop(start_url)
+   c = SpiderConfig.instance
+
+   start_uri = URI::parse(start_url)
+   c.queued_links << SpiderLink.new(start_uri,nil,0)
+   p c.queued_links
+   c.root_hosts << start_uri.host
+
+   loop do
+     link = c.queued_links.shift
+     if link.nil? then
+       p "Queue is empty. Quitting"
+       break
+     end
+     #if c.seen_links.include?(link.uri) then
+     #  p "Already seen #{link.uri.to_s}. Skipping."
+     #  next
+     #end
+     #c.seen_links << link.uri
+     new_links = proc_url(link) - (c.seen_links - c.queued_links.keys)
+     p "Q:#{c.queued_links.size} Seen:#{c.seen_links.size} #{link.uri.to_s} - #{new_links.size}"
+     new_links.each do |found_link|
+       next if found_link.nil?
+       #p "Found: #{found_link.to_s}"
+       depth = c.root_hosts.include?(found_link.host) ? link.depth : link.depth + 1
+       unless (depth > c.max_depth or c.queued_links.size > c.max_queue_size)
+         #q_link = SpiderLink.new(found_link,link.uri,depth)
+         c.queued_links << SpiderLink.new(found_link,link.uri,depth)
+         c.seen_links << found_link
+         #p "Added #{found_link.to_s} to queue"
+       end
+     end
+   end
+ end
+
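The script above defines no entry point, so a caller has to kick things off. A minimal sketch of driving it, with a placeholder seed URL and download root (the hard-coded $base would be overridden here):

  $base = "/tmp/seep/"               # writable download root; save_image creates per-host subdirectories
  link_loop("http://example.com/")   # breadth-first crawl from the seed

Links that stay on the seed's host keep their depth, while off-host links increment it, so max_depth bounds how far the crawl wanders from its starting site.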
spidr_test.rb ADDED
@@ -0,0 +1,11 @@
+ require 'rubygems'
+ require 'spidr'
+
+ p "Spider test!"
+
+ Spidr.site('http://www.definebabe.com/gallery/kme/jana-cova/') do |spider|
+   spider.every_url { |url| puts "url: #{url}"; sleep 0.2 }
+   spider.every_link { |origin,link| puts "link: #{link} (from #{origin})"; sleep 0.2 }
+   spider.every_failed_url { |url| puts "failed: #{url}"; sleep 0.2 }
+ end
+
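The spidr script above is a smoke test of that gem's callback hooks. A variant sketch using spidr's every_page callback to log the response status alongside each URL; every_page, page.code, and page.url are assumed from spidr's documented API, and the seed URL is a placeholder:

  Spidr.site('http://example.com/') do |spider|
    spider.every_page { |page| puts "#{page.code} #{page.url}"; sleep 0.2 }  # status and URL per fetched page
  end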
metadata ADDED
@@ -0,0 +1,172 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: seep
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Carl Zulauf
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-12-27 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: redis-native_hash
16
+ requirement: &12752080 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *12752080
25
+ - !ruby/object:Gem::Dependency
26
+ name: gd2-ffij
27
+ requirement: &12751600 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: *12751600
36
+ - !ruby/object:Gem::Dependency
37
+ name: curb
38
+ requirement: &12751060 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: *12751060
47
+ - !ruby/object:Gem::Dependency
48
+ name: nokogiri
49
+ requirement: &12750520 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: *12750520
58
+ - !ruby/object:Gem::Dependency
59
+ name: ruby-debug19
60
+ requirement: &12750040 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ type: :development
67
+ prerelease: false
68
+ version_requirements: *12750040
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: &12749500 !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ~>
75
+ - !ruby/object:Gem::Version
76
+ version: 2.3.0
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: *12749500
80
+ - !ruby/object:Gem::Dependency
81
+ name: bundler
82
+ requirement: &12749000 !ruby/object:Gem::Requirement
83
+ none: false
84
+ requirements:
85
+ - - ~>
86
+ - !ruby/object:Gem::Version
87
+ version: 1.0.0
88
+ type: :development
89
+ prerelease: false
90
+ version_requirements: *12749000
91
+ - !ruby/object:Gem::Dependency
92
+ name: jeweler
93
+ requirement: &12611560 !ruby/object:Gem::Requirement
94
+ none: false
95
+ requirements:
96
+ - - ~>
97
+ - !ruby/object:Gem::Version
98
+ version: 1.6.4
99
+ type: :development
100
+ prerelease: false
101
+ version_requirements: *12611560
102
+ - !ruby/object:Gem::Dependency
103
+ name: rcov
104
+ requirement: &12610980 !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ type: :development
111
+ prerelease: false
112
+ version_requirements: *12610980
113
+ description: Collection of web spidering and downloading tools using redis, curl,
114
+ and gd.
115
+ email: carl@linkleaf.com
116
+ executables: []
117
+ extensions: []
118
+ extra_rdoc_files:
119
+ - LICENSE.txt
120
+ - README
121
+ files:
122
+ - .document
123
+ - .rspec
124
+ - Gemfile
125
+ - Gemfile.lock
126
+ - LICENSE.txt
127
+ - README
128
+ - Rakefile
129
+ - doc/small.jpg
130
+ - doc/test_a.html
131
+ - doc/test_b.html
132
+ - lib/seep.rb
133
+ - lib/seep/doc.rb
134
+ - lib/seep/fetcher.rb
135
+ - lib/seep/image.rb
136
+ - seep.gemspec
137
+ - spec/a_spec.rb
138
+ - spec/doc_spec.rb
139
+ - spec/fetcher_spec.rb
140
+ - spec/image_spec.rb
141
+ - spec/spec_helper.rb
142
+ - spider.rb
143
+ - spidr_test.rb
144
+ homepage: http://github.com/carlzulauf/seep
145
+ licenses:
146
+ - MIT
147
+ post_install_message:
148
+ rdoc_options: []
149
+ require_paths:
150
+ - lib
151
+ required_ruby_version: !ruby/object:Gem::Requirement
152
+ none: false
153
+ requirements:
154
+ - - ! '>='
155
+ - !ruby/object:Gem::Version
156
+ version: '0'
157
+ segments:
158
+ - 0
159
+ hash: 1561364847905091588
160
+ required_rubygems_version: !ruby/object:Gem::Requirement
161
+ none: false
162
+ requirements:
163
+ - - ! '>='
164
+ - !ruby/object:Gem::Version
165
+ version: '0'
166
+ requirements: []
167
+ rubyforge_project:
168
+ rubygems_version: 1.8.10
169
+ signing_key:
170
+ specification_version: 3
171
+ summary: web spidering/downloading tools
172
+ test_files: []