seep 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.rspec +1 -0
- data/Gemfile +17 -0
- data/Gemfile.lock +63 -0
- data/LICENSE.txt +20 -0
- data/README +1 -0
- data/Rakefile +50 -0
- data/doc/small.jpg +0 -0
- data/doc/test_a.html +1785 -0
- data/doc/test_b.html +730 -0
- data/lib/seep.rb +19 -0
- data/lib/seep/doc.rb +44 -0
- data/lib/seep/fetcher.rb +107 -0
- data/lib/seep/image.rb +18 -0
- data/seep.gemspec +85 -0
- data/spec/a_spec.rb +7 -0
- data/spec/doc_spec.rb +42 -0
- data/spec/fetcher_spec.rb +66 -0
- data/spec/image_spec.rb +35 -0
- data/spec/spec_helper.rb +13 -0
- data/spider.rb +226 -0
- data/spidr_test.rb +11 -0
- metadata +172 -0
data/lib/seep.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'forwardable'
|
2
|
+
require 'uri'
|
3
|
+
require 'curb'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'gd2-ffij'
|
6
|
+
|
7
|
+
# Top-level namespace for the seep fetching/spidering helpers.
module Seep
  # The component files reopen this namespace (e.g. `class Seep::Doc`), so
  # they are loaded only after the module itself exists.
  require 'seep/fetcher'
  require 'seep/doc'
  require 'seep/image'

  class << self
    # Downloads a URL. All arguments are forwarded to Seep::Fetcher.open.
    #
    # @return [Seep::Fetcher] the fetcher after the request was performed
    def fetch(*args)
      Fetcher.open(*args)
    end

    # Downloads a URL and wraps the response as a document.
    # All arguments are forwarded to Seep.fetch.
    #
    # @return [Seep::Doc]
    def doc(*args)
      fetch(*args).to_doc
    end
  end
end
|
data/lib/seep/doc.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
# An HTML page parsed with Nokogiri, paired with the URL it came from so
# that the references found in the markup can be turned into absolute URLs.
class Seep::Doc
  attr_reader :dom, :url

  # @param url  [String] address of the page; base for relative references
  # @param html [String] raw markup, handed to Nokogiri::HTML.parse
  def initialize(url, html)
    @url = url
    @dom = Nokogiri::HTML.parse(html)
  end

  # Absolute URLs of every <a href> on the page, in document order.
  # Unresolvable hrefs and links equal to the page's own URL are left out.
  # The result is memoized.
  #
  # @return [Array<String>]
  def links
    @links ||= collect_urls("a", "href").reject { |link| link == url }
  end

  # Absolute URLs of every <img src> on the page, in document order.
  # The result is memoized.
  #
  # @return [Array<String>]
  def images
    @images ||= collect_urls("img", "src")
  end

  # @return [Array<String>] links followed by images
  def urls
    links + images
  end

  # Resolves a reference found in the page against the page URL.
  #
  # The reference is first joined as-is; if that fails (typically because it
  # contains characters that are illegal in a URI, such as spaces) it is
  # percent-escaped and joined again. URI.escape was removed in Ruby 3.0, so
  # the escaping goes through URI::DEFAULT_PARSER instead.
  #
  # @param relative_url [String, nil] attribute value from the markup
  # @return [String, nil] absolute URL, or nil when the reference is missing,
  #   cannot be resolved, or does not start with "http"
  def compute_url(relative_url)
    return nil if relative_url.nil?

    absolute = join_with_base(relative_url) || join_escaped(relative_url)
    # \A anchors at the start of the string; ^ would also match after a newline.
    absolute if absolute =~ /\Ahttp/i
  end

  def inspect
    "#<Seep::Doc #{url} links: #{links.count}, images: #{images.count}>"
  end

  private

  # Resolves the given attribute of every element matching +selector+ and
  # drops the references that could not be resolved.
  def collect_urls(selector, attribute)
    @dom.search(selector).map { |node| compute_url(node.get_attribute(attribute)) }.compact
  end

  # Joins +reference+ onto the page URL; nil when the join is not possible.
  def join_with_base(reference)
    URI.join(@url, reference).to_s
  rescue StandardError
    nil
  end

  # Same as #join_with_base, but percent-escapes the reference first. A
  # reference that is still unusable yields nil so that one bad attribute
  # does not abort link extraction for the whole page.
  def join_escaped(reference)
    join_with_base(URI::DEFAULT_PARSER.escape(reference))
  rescue StandardError
    nil
  end
end
|
data/lib/seep/fetcher.rb
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
# Downloads a single URL through curb (Curl::Easy), following redirects and
# aborting the transfer once the response grows beyond #max_file_size.
class Seep::Fetcher
  AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.121 Safari/535.2'

  attr_reader :curb, :size, :body
  attr_accessor :url, :max_file_size
  attr_accessor :request_headers, :response_headers

  # Builds a fetcher and performs the request in one step.
  #
  # @return [Seep::Fetcher]
  def self.open(url, options = {})
    new(url, options).open
  end

  # @param url [String] address to download
  # @param options [Hash]
  # @option options [String]  :user_agent    defaults to AGENT
  # @option options [Integer] :max_file_size size limit, defaults to 1MB
  # @option options [Integer] :max_redirects defaults to 5
  def initialize(url, options = {})
    self.url = url
    self.request_headers = { 'User-Agent' => options[:user_agent] || AGENT }
    self.response_headers = {}
    self.max_file_size = options[:max_file_size] || 1_048_576 # 1MB

    @curb = Curl::Easy.new(url)
    @curb.follow_location = true
    @curb.max_redirects = options[:max_redirects] || 5

    register_on_header!
    register_on_body!
  end

  # Raw Content-Type response header, or nil when none has been received.
  # HTTP header names are case-insensitive, so a differently-cased name
  # (e.g. "content-type") is accepted when the canonical spelling is absent.
  #
  # @return [String, nil]
  def content_type
    response_headers['Content-Type'] || header_value('Content-Type')
  end

  # @return [String] URL reported by curl after redirects, or the requested
  #   URL when curl reports none
  def dest_url
    curb.last_effective_url || url
  end

  # Performs the request, filling #body, #size and #response_headers.
  # The +redirect+ argument is unused; it is kept so existing callers that
  # pass it keep working.
  #
  # @return [Seep::Fetcher] self
  def open(redirect = 0)
    @body = ""
    @size = 0
    curb.headers = request_headers
    curb.perform
    self
  end

  def inspect
    "#<Seep::Fetcher #{[content_type, dest_url].compact.join(' ')}>"
  end

  # Writes the downloaded body to +path+. Binary mode keeps image data
  # byte-exact (no newline or encoding translation).
  def export(path)
    File.open(path, 'wb') { |file| file.write(@body) }
  end

  # File extension (with leading dot) matching the response's media type,
  # or "" when the type is unknown or missing.
  #
  # @return [String]
  def ext
    case mime_type
    when "image/jpeg" then ".jpg"
    when "image/png"  then ".png"
    when "image/gif"  then ".gif"
    when "text/html"  then ".html"
    when "text/plain" then ".txt"
    else ""
    end
  end

  # True when the response is declared as an image AND the body can be
  # loaded as one.
  #
  # The previous `!! content_type =~ /^image/` applied `!!` before `=~`, so
  # the regexp was matched against true/false and the method never reported
  # an image.
  def image?
    type = mime_type
    !type.nil? && type.start_with?("image") && to_image.valid?
  end

  # @return [Seep::Image] memoized image built from the body
  def to_image
    @image ||= Seep::Image.new(body)
  end

  # True for HTML responses, including ones that carry parameters such as
  # "text/html; charset=utf-8".
  def doc?
    mime_type == "text/html"
  end

  # Memoized document built from the body. The post-redirect URL is used as
  # the document URL so relative links resolve against the page that was
  # actually served.
  #
  # @return [Seep::Doc]
  def to_doc
    @doc ||= Seep::Doc.new(dest_url, body)
  end

  # Installs the header callback: records each "Name: value" line in
  # #response_headers and takes #size from Content-Length. When the announced
  # size exceeds #max_file_size the block returns -1 instead of the header
  # length (the spec expects this to surface as Curl::Err::WriteError).
  def register_on_header!
    curb.on_header do |header|
      key, value = header.split(":", 2)
      unless key.nil? || value.nil?
        key.strip!
        value.strip!
        @size = value.to_i if key.casecmp("Content-Length").zero?
        response_headers[key] = value
      end
      if @size > max_file_size
        @size = -1
      else
        header.length
      end
    end
  end

  # Installs the body callback: appends each chunk to #body and keeps #size
  # equal to the body length. Returns -1 instead of the chunk length once
  # #max_file_size is exceeded, which stops the transfer.
  def register_on_body!
    curb.on_body do |chunk|
      @body << chunk # append in place; += re-copied the whole body per chunk
      @size = @body.length
      if @size > max_file_size
        @size = -1
      else
        chunk.length
      end
    end
  end

  private

  # Media type from Content-Type without parameters, lower-cased; nil when
  # no Content-Type is known.
  def mime_type
    raw = content_type
    return nil if raw.nil?

    raw.split(";", 2).first.to_s.strip.downcase
  end

  # Case-insensitive lookup of a response header value.
  def header_value(name)
    pair = response_headers.find { |key, _value| key.to_s.casecmp(name).zero? }
    pair && pair.last
  end
end
|
data/lib/seep/image.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Wraps image data loaded through GD2 and remembers whether loading worked.
class Seep::Image
  extend Forwardable

  attr_reader :image

  # size/width/height are answered by the wrapped GD2 image;
  # #dimensions is a second name for #size.
  def_delegators :image, :size, :width, :height
  def_delegator :image, :size, :dimensions

  # @param data [String] raw image data, passed to GD2::Image.load
  def initialize(data)
    @image = GD2::Image.load(data)
    @valid = true
  rescue StandardError
    # Any load failure simply marks the instance as invalid.
    @valid = false
  end

  # @return [Boolean] whether the data could be loaded as an image
  def valid?
    @valid
  end
end
|
data/seep.gemspec
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
  s.name    = "seep"
  s.version = "0.0.2"

  if s.respond_to? :required_rubygems_version=
    s.required_rubygems_version = Gem::Requirement.new(">= 0")
  end
  s.authors     = ["Carl Zulauf"]
  s.date        = "2011-12-27"
  s.description = "Collection of web spidering and downloading tools using redis, curl, and gd."
  s.email       = "carl@linkleaf.com"
  s.extra_rdoc_files = %w[LICENSE.txt README]
  s.files = %w[
    .document
    .rspec
    Gemfile
    Gemfile.lock
    LICENSE.txt
    README
    Rakefile
    doc/small.jpg
    doc/test_a.html
    doc/test_b.html
    lib/seep.rb
    lib/seep/doc.rb
    lib/seep/fetcher.rb
    lib/seep/image.rb
    seep.gemspec
    spec/a_spec.rb
    spec/doc_spec.rb
    spec/fetcher_spec.rb
    spec/image_spec.rb
    spec/spec_helper.rb
    spider.rb
    spidr_test.rb
  ]
  s.homepage         = "http://github.com/carlzulauf/seep"
  s.licenses         = ["MIT"]
  s.require_paths    = ["lib"]
  s.rubygems_version = "1.8.10"
  s.summary          = "web spidering/downloading tools"

  # Gem name / version requirement pairs, in declaration order.
  runtime_deps = [
    ["redis-native_hash", ">= 0"],
    ["gd2-ffij", ">= 0"],
    ["curb", ">= 0"],
    ["nokogiri", ">= 0"]
  ]
  development_deps = [
    ["ruby-debug19", ">= 0"],
    ["rspec", "~> 2.3.0"],
    ["bundler", "~> 1.0.0"],
    ["jeweler", "~> 1.6.4"],
    ["rcov", ">= 0"]
  ]

  versioned = s.respond_to?(:specification_version)
  s.specification_version = 3 if versioned

  # Typed dependencies are only declared when the spec responds to
  # specification_version and RubyGems is at least 1.2.0; otherwise every
  # entry goes through plain add_dependency.
  typed = versioned && Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0')
  runtime_adder     = typed ? :add_runtime_dependency : :add_dependency
  development_adder = typed ? :add_development_dependency : :add_dependency

  runtime_deps.each do |gem_name, requirement|
    s.public_send(runtime_adder, gem_name, [requirement])
  end
  development_deps.each do |gem_name, requirement|
    s.public_send(development_adder, gem_name, [requirement])
  end
end
|
85
|
+
|
data/spec/a_spec.rb
ADDED
data/spec/doc_spec.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
# Exercises Seep::Doc against the two HTML fixtures stored under doc/.
describe "Seep::Doc" do
  before :all do
    @doc_a = Seep::Doc.new("http://linkleaf.com/test_a.html", File.read("doc/test_a.html"))
    @doc_b = Seep::Doc.new("http://examancer.com/gallery/", File.read("doc/test_b.html"))

    # URLs the fixtures are expected to yield.
    @link_a  = "https://exanotes.com/"
    @link_b  = "http://linkleaf.com/index.php?leaf=212"
    @image_a = "http://l.yimg.com/a/i/us/we/52/34.gif"
    @image_b = "http://examancer.com/pictures/polished/bolGallery/thumbnail_Good%20Catch%20Missy.jpg"
  end

  describe "#links" do
    it "should contain absolute links" do
      @doc_a.links.should include(@link_a)
    end

    it "should contain relative links, converted to absolute format" do
      @doc_a.links.should include(@link_b)
    end
  end

  describe "#images" do
    it "should contain images from the test pages" do
      @doc_a.images.should include(@image_a)
      @doc_b.images.should include(@image_b)
    end
  end

  describe "#urls" do
    it "should contain both links and images" do
      @doc_a.urls.should include(@link_b)
      @doc_a.urls.should include(@image_a)
    end
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
# Exercises Seep::Fetcher against live URLs; these examples need network
# access to the hosts named below.
describe "Seep::Fetcher" do
  before :all do
    @link = Seep::Fetcher.open( "http://linkleaf.com/test_a.html" )
    @jpg = Seep::Fetcher.open("http://examancer.com/pictures/funny/funny-pictures-ooooooo.jpg")
    @png = Seep::Fetcher.open("http://examancer.com/pictures/funny/cnnpoll.png")
    @gif = Seep::Fetcher.open("http://examancer.com/pictures/funny/lassie-shreds.gif")
    @txt = Seep::Fetcher.open("http://examancer.com/remoteip.txt")
  end

  describe "#open" do
    it "populates #size" do
      @link.size.should == 56_607
    end
    it "populates #response_headers" do
      @link.response_headers["Server"].should match(/Apache/)
    end
    it "throws exception when :max_file_size is exceeded" do
      lambda do
        Seep::Fetcher.open( @link.url, max_file_size: 25_000 )
      end.should( raise_exception(Curl::Err::WriteError) )
    end
  end

  describe "#size" do
    it "should be the same as #body.length" do
      @link.size.should == @link.body.length
    end
  end

  describe "#dest_url" do
    it "should show the url resulting from a redirect" do
      Seep::Fetcher.open("http://reddit.com").dest_url.should == "http://www.reddit.com/"
    end
  end

  describe "#content_type" do
    it "should be nil before #open is called" do
      link = Seep::Fetcher.new("http://linkleaf.com/")
      link.content_type.should be_nil
    end
    it "returns the correct type for html" do
      @link.content_type.should == "text/html"
    end
    it "returns the correct type for images" do
      @jpg.content_type.should == "image/jpeg"
      @png.content_type.should == "image/png"
      @gif.content_type.should == "image/gif"
    end
    it "returns the correct type for text" do
      @txt.content_type.should == "text/plain"
    end
  end

  describe "#ext" do
    it "should be blank when #open has not been called" do
      # This example previously built the fetcher without asserting
      # anything, so it could never fail.
      Seep::Fetcher.new(@link.url).ext.should == ""
    end
    it "should give proper extensions for images" do
      @jpg.ext.should == ".jpg"
      @png.ext.should == ".png"
      @gif.ext.should == ".gif"
    end
  end
end
|
data/spec/image_spec.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
# Exercises Seep::Image with the JPEG fixture in doc/ and with data that is
# not an image at all.
describe "Seep::Image" do
  before :all do
    @small = Seep::Image.new(File.read("doc/small.jpg"))
    @bad   = Seep::Image.new("Not an image")
  end

  describe "#dimensions" do
    it "should provide the dimensions of an image" do
      @small.dimensions.should eq([320, 240])
    end
  end

  describe "#width" do
    it "should provide the width of an image" do
      @small.width.should eq(320)
    end
  end

  describe "#height" do
    it "should provide the height of an image" do
      @small.height.should eq(240)
    end
  end

  describe "#valid?" do
    it "should return true if the image is good" do
      @small.should be_valid
    end

    it "should return false if the image is bad" do
      @bad.should_not be_valid
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
2
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
3
|
+
require 'rspec'
|
4
|
+
require 'seep'
|
5
|
+
require 'ruby-debug'
|
6
|
+
|
7
|
+
# Load every Ruby file below spec/support (custom matchers, macros, ...),
# including files in nested directories.
Dir.glob(File.join(File.dirname(__FILE__), "support", "**", "*.rb")).each do |support_file|
  require support_file
end

# No custom RSpec settings yet.
RSpec.configure do |_config|
end
|