seep 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,226 @@
1
+ require 'rubygems'
2
+ require 'open-uri'
3
+ require 'nokogiri'
4
+ require 'fileutils'
5
+ require 'pp'
6
+ require 'singleton'
7
+ require 'net/http'
8
+
9
# Fetch the body of link.uri over plain HTTP.
#
# link - a SpiderLink (responds to #uri, #prev_uri, #depth). prev_uri,
#        when present, is sent as the Referer header.
#
# Returns the response body String on a 2xx response, or nil on any
# network error or non-success status. A 3xx response is not followed
# inline; instead the Location target is wrapped in a new SpiderLink and
# pushed onto the shared SpiderConfig queue for a later pass.
def grab_data_from_uri (link)
  headers = {
    'User-Agent' => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.121 Safari/535.2'
  }
  headers['Referer'] = link.prev_uri.to_s unless link.prev_uri.nil?
  req_path = link.uri.path
  req_path += "?#{link.uri.query}" unless link.uri.query.nil?
  req_path = "/" if req_path.nil? or req_path == ""
  begin
    # Bug fix: the original called Net::HTTP.start(host) without the port,
    # so any URL on a non-standard port silently fetched the wrong server.
    res = Net::HTTP.start(link.uri.host, link.uri.port) { |http|
      http.get(req_path, headers)
    }
  rescue Net::HTTPExceptions, Errno::ETIMEDOUT, Errno::ECONNRESET, EOFError => e
    p "request failed: #{e.class}"
    return nil
  rescue StandardError => e
    # Catch-all for anything else (the original also swallowed
    # NoMethodError here, which can mask real bugs -- log the exception
    # so failures stay diagnosable). SystemExit is not a StandardError,
    # so Ctrl-C/exit still propagates.
    p "some other error"
    pp e
    return nil
  end
  return res.body if res.is_a? Net::HTTPSuccess
  if res.is_a? Net::HTTPRedirection
    p "Redirection detected."
    unless res.header['location'].nil?
      begin
        new_link = SpiderLink.new(URI::join(link.uri.to_s, res.header['location']), link.uri, link.depth)
      rescue StandardError
        return nil # unparseable redirect target
      end
      p "Redirected to: #{new_link.uri.to_s}"
      SpiderConfig.instance.queued_links << new_link
    end
  end
  return nil
end
60
+
61
# Fetch the page body for +link+. Thin alias over grab_data_from_uri,
# kept so call sites read naturally.
def open_link (link)
  grab_data_from_uri(link)
end
64
+
65
+ #url = "http://galleries.nextdoor-models.com/static/clean/149.html?ccbill_id=1757428&site_link=http://www.nextdoor-models.com/"
66
+ #p open_link(url)
67
+
68
# Parse +html+ and partition every <a href> on the page into image URIs
# (paths ending in .jpg/.jpeg, case-insensitive) and ordinary page URIs,
# each resolved against +uri+.
#
# uri  - the page's own URI (base for relative hrefs).
# html - the raw HTML String, or nil when the fetch failed.
#
# Returns [links, images], both deduplicated Arrays of URI objects.
# Returns two empty arrays when html is nil.
def get_links(uri,html)
  return [],[] if html.nil?
  doc = Nokogiri::HTML.parse(html)
  images = []; links = []
  doc.search("a").each do |anchor|
    href = anchor.get_attribute("href")
    # Bug fix: the original fell through with a nil new_uri for anchors
    # without an href, pushing a literal nil into +links+ (which the
    # crawler then had to filter out downstream). Skip them here.
    next if href.nil?
    begin
      new_uri = URI::join(uri.to_s, href)
    rescue StandardError
      p "Bad URI. Skipping. #{href}"
      next
    end
    if new_uri.to_s =~ /jpe?g$/i
      images << new_uri
    else
      links << new_uri
    end
  end
  return links.uniq, images.uniq
end
93
+
94
+ #url = "http://galleries.nextdoor-models.com/static/clean/149.html?ccbill_id=1757428&site_link=http://www.nextdoor-models.com/"
95
+ #html = open_link(url)
96
+ #links, images = get_links(url,html)
97
+ #p "links: "
98
+ #pp links
99
+ #p "images: "
100
+ #pp images
101
+
102
# Root directory for all downloaded images; one subdirectory per host.
$base = "/mnt/media/stuff/porn/"

# Download the image at +uri+ and write it to $base/<host>/<name>.jpg.
# +parent+ is the page the image was found on (sent as the Referer).
# Skips the write entirely when the download failed.
def save_image(uri,name,parent)
  dir = $base + uri.host
  path = "#{dir}/#{name}.jpg"
  FileUtils.mkdir_p(dir)
  res = grab_data_from_uri(SpiderLink.new(uri,parent,0))
  # Bug fix: the original wrote even when res was nil (failed fetch),
  # producing empty/garbage files.
  if res.nil?
    p "Download failed for #{uri.to_s}; not writing #{path}"
    return
  end
  p "Saving #{uri.to_s} to #{path}"
  # Bug fix: File.open(path,"w").write(res) leaked the file handle and
  # opened in text mode; use the self-closing block form and binary mode.
  File.open(path, "wb") { |f| f.write(res) }
end
111
+
112
# Build a random alphanumeric token of +length+ characters (default 10),
# drawn uniformly from [a-zA-Z0-9]. Used to make image batch names unique.
def random_string(length=10)
  alphabet = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
  Array.new(length) { alphabet[rand(alphabet.size)] }.join
end
118
+
119
+ #url = "http://galleries.nextdoor-models.com/content/149/thm/01.jpg"
120
+ #uri = URI::parse(url)
121
+ #save_image(uri,1001)
122
+
123
+ #url = "http://galleries.nextdoor-models.com/static/clean/149.html?ccbill_id=1757428&site_link=http://www.nextdoor-models.com/"
124
+ #html = open_link(url)
125
+ #links, images = get_links(url,html)
126
+ #batch_id = 1
127
+ #image_count = 0
128
+ #images.each do |img|
129
+ # image_count += 1
130
+ # save_image(img,"%08s.%08d.jpg" % [ batch_id, image_count ] )
131
+ #end
132
+
133
# Process one queued page: fetch it, save every image it references under
# a timestamped batch name, and return the non-image links found there.
def proc_url(link)
  html = open_link(link)
  links, images = get_links(link.uri, html)
  # Batch prefix groups one page's images together on disk.
  batch = Time.now.strftime("%Y%m%d.%H%M.%S.") + random_string(4)
  images.each_with_index do |img, idx|
    save_image(img, "#{batch}.%04d" % (idx + 1), link.uri)
  end
  links
end
145
+
146
# A URL queued for crawling, carrying its provenance (the page it was
# found on, used as Referer) and its crawl depth.
class SpiderLink
  attr_accessor :uri, :prev_uri, :depth
  def initialize ( uri, prev_uri, depth )
    @uri = uri
    @prev_uri = prev_uri
    @depth = depth
  end
  # Two links are equal when their URIs render to the same string;
  # prev_uri and depth are deliberately ignored.
  # Bug fix: the original computed the debug flag from other.to_s
  # (Object#to_s, an inspect-style string) instead of other.uri.to_s,
  # so the trace line never fired and disagreed with the returned value.
  def == (other)
    ret = @uri.to_s == other.uri.to_s
    p "Compare #{@uri.to_s} to #{other.uri.to_s}: #{ret}" if ret
    ret
  end
end
159
+
160
# Process-wide crawl settings and shared mutable state. Singleton so the
# fetcher, queue, and main loop all see the same queue and seen-set.
class SpiderConfig
  include Singleton
  attr_accessor :max_depth, :root_hosts, :queued_links, :seen_links, :max_queue_size
  # Defaults: wander at most 5 hops off a root host, and cap the pending
  # queue at 100,000 entries.
  def initialize
    @max_depth      = 5
    @max_queue_size = 100000
    @root_hosts     = []
    @queued_links   = SpiderQueue.new
    @seen_links     = []
  end
end
171
+
172
# A FIFO of SpiderLinks keyed by URI (a Hash with insertion order kept in
# a side array). Re-adding a URI that is already queued bumps it to the
# back of the line but keeps the originally stored link object -- the new
# link's prev_uri/depth are not recorded.
class SpiderQueue < Hash
  def initialize
    super
    @order = []
  end

  # Enqueue +link+. Known URIs are moved to the tail; new ones are stored.
  def << (link)
    key = link.uri
    if has_key?(key)
      @order.delete(key)
    else
      self[key] = link
    end
    @order << key
  end

  # Dequeue and return the oldest link, or nil when the queue is empty.
  def shift
    delete(@order.shift)
  end
end
191
+
192
# Breadth-first crawl starting from +start_url+. Repeatedly pulls a link
# off the shared queue, fetches the page (downloading its images as a
# side effect of proc_url), and enqueues newly discovered links until the
# queue drains or the depth/size limits stop admitting new entries.
def link_loop(start_url)
  config = SpiderConfig.instance

  start_uri = URI::parse(start_url)
  config.queued_links << SpiderLink.new(start_uri, nil, 0)
  p config.queued_links
  config.root_hosts << start_uri.host

  loop do
    link = config.queued_links.shift
    if link.nil?
      p "Queue is empty. Quitting"
      break
    end
    # Subtract already-seen URIs, except those still waiting in the queue.
    new_links = proc_url(link) - (config.seen_links - config.queued_links.keys)
    p "Q:#{config.queued_links.size} Seen:#{config.seen_links.size} #{link.uri.to_s} - #{new_links.size}"
    new_links.each do |found|
      next if found.nil?
      # Staying on a root host is free; leaving one costs a depth level.
      depth = config.root_hosts.include?(found.host) ? link.depth : link.depth + 1
      next if depth > config.max_depth or config.queued_links.size > config.max_queue_size
      config.queued_links << SpiderLink.new(found, link.uri, depth)
      config.seen_links << found
    end
  end
end
226
+
@@ -0,0 +1,11 @@
1
# Smoke test for the third-party Spidr crawler: walk a single site and
# print every visited URL, every discovered link, and every failure,
# throttled to roughly five requests per second.
require 'rubygems'
require 'spidr'

p "Spider test!"

Spidr.site('http://www.definebabe.com/gallery/kme/jana-cova/') do |spider|
  spider.every_url        { |url| puts "url: #{url}"; sleep 0.2 }
  spider.every_link       { |origin,link| puts "link: #{link} (from #{origin})"; sleep 0.2 }
  spider.every_failed_url { |url| puts "failed: #{url}"; sleep 0.2 }
end
11
+
metadata ADDED
@@ -0,0 +1,172 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: seep
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Carl Zulauf
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-12-27 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: redis-native_hash
16
+ requirement: &12752080 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *12752080
25
+ - !ruby/object:Gem::Dependency
26
+ name: gd2-ffij
27
+ requirement: &12751600 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: *12751600
36
+ - !ruby/object:Gem::Dependency
37
+ name: curb
38
+ requirement: &12751060 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: *12751060
47
+ - !ruby/object:Gem::Dependency
48
+ name: nokogiri
49
+ requirement: &12750520 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: *12750520
58
+ - !ruby/object:Gem::Dependency
59
+ name: ruby-debug19
60
+ requirement: &12750040 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ type: :development
67
+ prerelease: false
68
+ version_requirements: *12750040
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: &12749500 !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ~>
75
+ - !ruby/object:Gem::Version
76
+ version: 2.3.0
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: *12749500
80
+ - !ruby/object:Gem::Dependency
81
+ name: bundler
82
+ requirement: &12749000 !ruby/object:Gem::Requirement
83
+ none: false
84
+ requirements:
85
+ - - ~>
86
+ - !ruby/object:Gem::Version
87
+ version: 1.0.0
88
+ type: :development
89
+ prerelease: false
90
+ version_requirements: *12749000
91
+ - !ruby/object:Gem::Dependency
92
+ name: jeweler
93
+ requirement: &12611560 !ruby/object:Gem::Requirement
94
+ none: false
95
+ requirements:
96
+ - - ~>
97
+ - !ruby/object:Gem::Version
98
+ version: 1.6.4
99
+ type: :development
100
+ prerelease: false
101
+ version_requirements: *12611560
102
+ - !ruby/object:Gem::Dependency
103
+ name: rcov
104
+ requirement: &12610980 !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ type: :development
111
+ prerelease: false
112
+ version_requirements: *12610980
113
+ description: Collection of web spidering and downloading tools using redis, curl,
114
+ and gd.
115
+ email: carl@linkleaf.com
116
+ executables: []
117
+ extensions: []
118
+ extra_rdoc_files:
119
+ - LICENSE.txt
120
+ - README
121
+ files:
122
+ - .document
123
+ - .rspec
124
+ - Gemfile
125
+ - Gemfile.lock
126
+ - LICENSE.txt
127
+ - README
128
+ - Rakefile
129
+ - doc/small.jpg
130
+ - doc/test_a.html
131
+ - doc/test_b.html
132
+ - lib/seep.rb
133
+ - lib/seep/doc.rb
134
+ - lib/seep/fetcher.rb
135
+ - lib/seep/image.rb
136
+ - seep.gemspec
137
+ - spec/a_spec.rb
138
+ - spec/doc_spec.rb
139
+ - spec/fetcher_spec.rb
140
+ - spec/image_spec.rb
141
+ - spec/spec_helper.rb
142
+ - spider.rb
143
+ - spidr_test.rb
144
+ homepage: http://github.com/carlzulauf/seep
145
+ licenses:
146
+ - MIT
147
+ post_install_message:
148
+ rdoc_options: []
149
+ require_paths:
150
+ - lib
151
+ required_ruby_version: !ruby/object:Gem::Requirement
152
+ none: false
153
+ requirements:
154
+ - - ! '>='
155
+ - !ruby/object:Gem::Version
156
+ version: '0'
157
+ segments:
158
+ - 0
159
+ hash: 1561364847905091588
160
+ required_rubygems_version: !ruby/object:Gem::Requirement
161
+ none: false
162
+ requirements:
163
+ - - ! '>='
164
+ - !ruby/object:Gem::Version
165
+ version: '0'
166
+ requirements: []
167
+ rubyforge_project:
168
+ rubygems_version: 1.8.10
169
+ signing_key:
170
+ specification_version: 3
171
+ summary: web spidering/downloading tools
172
+ test_files: []