seep 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.rspec +1 -0
- data/Gemfile +17 -0
- data/Gemfile.lock +63 -0
- data/LICENSE.txt +20 -0
- data/README +1 -0
- data/Rakefile +50 -0
- data/doc/small.jpg +0 -0
- data/doc/test_a.html +1785 -0
- data/doc/test_b.html +730 -0
- data/lib/seep.rb +19 -0
- data/lib/seep/doc.rb +44 -0
- data/lib/seep/fetcher.rb +107 -0
- data/lib/seep/image.rb +18 -0
- data/seep.gemspec +85 -0
- data/spec/a_spec.rb +7 -0
- data/spec/doc_spec.rb +42 -0
- data/spec/fetcher_spec.rb +66 -0
- data/spec/image_spec.rb +35 -0
- data/spec/spec_helper.rb +13 -0
- data/spider.rb +226 -0
- data/spidr_test.rb +11 -0
- metadata +172 -0
data/lib/seep.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'forwardable'
|
2
|
+
require 'uri'
|
3
|
+
require 'curb'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'gd2-ffij'
|
6
|
+
|
7
|
+
# Top-level namespace of the gem. Besides holding the component classes it
# offers one-call shortcuts for fetching a URL and for parsing the result.
module Seep
  # The components are declared as `class Seep::X`, so they are loaded here,
  # once the Seep constant already exists.
  require 'seep/fetcher'
  require 'seep/doc'
  require 'seep/image'

  class << self
    # Forwards all arguments to Fetcher.open and returns whatever it returns.
    def fetch(*args)
      Fetcher.open(*args)
    end

    # Fetches exactly like .fetch, then returns the result of calling
    # #to_doc on the fetcher.
    def doc(*args)
      fetcher = fetch(*args)
      fetcher.to_doc
    end
  end
end
|
data/lib/seep/doc.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
# An HTML page parsed with Nokogiri, paired with the URL it came from.
# Extracts the absolute http(s) URLs referenced by its anchors and images.
class Seep::Doc
  attr_reader :dom, :url

  # url  - String URL of the page; used as the base for relative references.
  # html - String of HTML markup handed to Nokogiri::HTML.parse.
  def initialize(url, html)
    @url = url
    @dom = Nokogiri::HTML.parse(html)
  end

  # Returns an Array of absolute URLs taken from every <a href>, in document
  # order. References that do not resolve to an http(s) URL are dropped, as
  # are links equal to the page's own URL. The result is memoized.
  def links
    @links ||= attribute_urls("a", "href").reject { |link| link == url }
  end

  # Returns an Array of absolute URLs taken from every <img src>, in document
  # order, dropping references that do not resolve to an http(s) URL.
  # The result is memoized.
  def images
    @images ||= attribute_urls("img", "src")
  end

  # Returns links followed by images as one new Array.
  def urls
    links + images
  end

  # Resolves a reference found in the page against the page URL.
  #
  # relative_url - String href/src value, or nil.
  #
  # Returns the absolute URL as a String, or nil when the reference is nil,
  # cannot be parsed even after escaping, or does not start with "http".
  def compute_url(relative_url)
    return nil if relative_url.nil?

    absolute = begin
      URI.join(@url, relative_url).to_s
    rescue StandardError
      # Real-world hrefs often contain raw spaces or other unsafe characters;
      # retry once with those escaped. URI.escape was removed in Ruby 3.0,
      # so the parser's own escape method is used instead.
      begin
        URI.join(@url, URI::DEFAULT_PARSER.escape(relative_url)).to_s
      rescue StandardError
        # One unparseable reference must not abort extraction of the rest.
        nil
      end
    end

    # \A anchors at the start of the string; ^ would also match after a newline.
    absolute =~ /\Ahttp/i ? absolute : nil
  end

  def inspect
    "#<Seep::Doc #{url} links: #{links.count}, images: #{images.count}>"
  end

  private

  # Collects compute_url results for `attribute` on every node matching
  # `selector`, skipping nodes that yield nil.
  def attribute_urls(selector, attribute)
    @dom.search(selector).map { |node| compute_url(node.get_attribute(attribute)) }.compact
  end
end
|
data/lib/seep/fetcher.rb
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
# Downloads a single URL through curb (Curl::Easy), collecting the response
# headers and body and cutting the transfer short once it grows past
# max_file_size.
class Seep::Fetcher
  AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.121 Safari/535.2'

  # size: Content-Length while headers arrive, then the number of body bytes
  # received so far; -1 once max_file_size has been exceeded.
  attr_reader :curb, :size, :body
  attr_accessor :url, :max_file_size
  attr_accessor :request_headers, :response_headers

  # url     - String URL to request.
  # options - Hash of optional settings:
  #           :user_agent    - User-Agent header value (default: AGENT).
  #           :max_file_size - byte limit for the response (default: 1MB).
  #           :max_redirects - redirects to follow (default: 5).
  def initialize(url, options = {})
    self.url = url
    self.request_headers = { 'User-Agent' => options[:user_agent] || AGENT }
    self.response_headers = {}
    self.max_file_size = options[:max_file_size] || 1_048_576 # 1MB

    @curb = Curl::Easy.new(url)
    @curb.follow_location = true
    @curb.max_redirects = options[:max_redirects] || 5

    register_on_header!
    register_on_body!
  end

  # Raw Content-Type response header, or nil when none has been received.
  # Falls back to a case-insensitive lookup so a lower-cased header name
  # ("content-type") is found as well.
  def content_type
    return response_headers['Content-Type'] if response_headers.key?('Content-Type')

    pair = response_headers.find { |name, _value| name.to_s.casecmp('Content-Type') == 0 }
    pair && pair.last
  end

  # URL curb reports as last effective, or the requested URL when curb
  # reports none.
  def dest_url
    curb.last_effective_url || url
  end

  # Performs the request and returns self.
  #
  # redirect - unused; kept so existing callers passing it keep working.
  def open(redirect = 0)
    @body = "".dup # unfrozen buffer; body chunks are appended in place
    @size = 0
    @doc = @image = nil # drop objects memoized from a previous request
    curb.headers = request_headers
    curb.perform
    self
  end

  def inspect
    label = content_type.nil? ? dest_url : "#{content_type} #{dest_url}"
    "#<Seep::Fetcher #{label}>"
  end

  # Writes the downloaded body to `path`. Binary mode keeps image bytes
  # intact on platforms that translate newlines in text mode.
  def export(path)
    File.open(path, 'wb') { |file| file.write(@body) }
  end

  # File extension (with leading dot) for the response's media type, or ""
  # when the type is unknown or no response has been received.
  def ext
    case mime_type
    when "image/jpeg" then ".jpg"
    when "image/png"  then ".png"
    when "image/gif"  then ".gif"
    when "text/html"  then ".html"
    when "text/plain" then ".txt"
    else ""
    end
  end

  # True when the response is declared as an image and the body actually
  # loads as one.
  def image?
    # The match needs its own parentheses: `!! content_type =~ /.../` parses
    # as `(!!content_type) =~ /.../` and can never be true.
    !!(mime_type =~ /\Aimage\//) && to_image.valid?
  end

  def to_image
    @image ||= Seep::Image.new(body)
  end

  # True when the response's media type is text/html, with or without
  # parameters such as a charset.
  def doc?
    mime_type == "text/html"
  end

  # NOTE(review): the document is built with the requested url, not dest_url,
  # so after a redirect relative links resolve against the original URL.
  # Confirm whether that is intended.
  def to_doc
    @doc ||= Seep::Doc.new(url, body)
  end

  # Builds a fetcher and performs the request in one step.
  def self.open(url, options = {})
    new(url, options).open
  end

  # Installs the header callback: records each "Name: value" line in
  # response_headers and tracks Content-Length in @size.
  def register_on_header!
    curb.on_header do |header|
      key, value = header.split(":", 2)
      unless key.nil? || value.nil?
        key = key.strip
        value = value.strip
        @size = value.to_i if key.casecmp("Content-Length") == 0
        response_headers[key] = value
      end
      if @size > max_file_size
        # Yields -1 instead of the header size; the spec expects
        # Curl::Err::WriteError to be raised in this situation.
        @size = -1
      else
        header.bytesize
      end
    end
  end

  # Installs the body callback: appends each chunk to @body and keeps @size
  # at the number of bytes received.
  def register_on_body!
    curb.on_body do |chunk|
      @body << chunk # in-place append avoids re-copying the body per chunk
      @size = @body.bytesize
      if @size > max_file_size
        # Same abort signal as in the header callback.
        @size = -1
      else
        chunk.bytesize
      end
    end
  end

  private

  # Media type from Content-Type, lower-cased and without parameters
  # ("Text/HTML; charset=utf-8" gives "text/html"); nil when no Content-Type
  # header is present.
  def mime_type
    type = content_type
    return nil if type.nil?

    type.split(";", 2).first.to_s.strip.downcase
  end
end
|
data/lib/seep/image.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Wrapper around the object returned by GD2::Image.load for a blob of image
# data. Remembers whether loading succeeded.
class Seep::Image
  extend Forwardable

  attr_reader :image

  # Geometry is answered by the wrapped image; #dimensions is another name
  # for #size.
  def_delegators :image, :size, :width, :height
  def_delegator :image, :size, :dimensions

  # data - image data passed straight to GD2::Image.load.
  #
  # Any StandardError raised while loading is swallowed and only recorded
  # through #valid?; in that case #image stays nil.
  def initialize(data)
    @image = GD2::Image.load(data)
    @valid = true
  rescue
    @valid = false
  end

  # Whether GD2 loaded the data without raising.
  def valid?
    @valid
  end
end
|
data/seep.gemspec
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = "seep"
|
8
|
+
s.version = "0.0.2"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Carl Zulauf"]
|
12
|
+
s.date = "2011-12-27"
|
13
|
+
s.description = "Collection of web spidering and downloading tools using redis, curl, and gd."
|
14
|
+
s.email = "carl@linkleaf.com"
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE.txt",
|
17
|
+
"README"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
".document",
|
21
|
+
".rspec",
|
22
|
+
"Gemfile",
|
23
|
+
"Gemfile.lock",
|
24
|
+
"LICENSE.txt",
|
25
|
+
"README",
|
26
|
+
"Rakefile",
|
27
|
+
"doc/small.jpg",
|
28
|
+
"doc/test_a.html",
|
29
|
+
"doc/test_b.html",
|
30
|
+
"lib/seep.rb",
|
31
|
+
"lib/seep/doc.rb",
|
32
|
+
"lib/seep/fetcher.rb",
|
33
|
+
"lib/seep/image.rb",
|
34
|
+
"seep.gemspec",
|
35
|
+
"spec/a_spec.rb",
|
36
|
+
"spec/doc_spec.rb",
|
37
|
+
"spec/fetcher_spec.rb",
|
38
|
+
"spec/image_spec.rb",
|
39
|
+
"spec/spec_helper.rb",
|
40
|
+
"spider.rb",
|
41
|
+
"spidr_test.rb"
|
42
|
+
]
|
43
|
+
s.homepage = "http://github.com/carlzulauf/seep"
|
44
|
+
s.licenses = ["MIT"]
|
45
|
+
s.require_paths = ["lib"]
|
46
|
+
s.rubygems_version = "1.8.10"
|
47
|
+
s.summary = "web spidering/downloading tools"
|
48
|
+
|
49
|
+
if s.respond_to? :specification_version then
|
50
|
+
s.specification_version = 3
|
51
|
+
|
52
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
53
|
+
s.add_runtime_dependency(%q<redis-native_hash>, [">= 0"])
|
54
|
+
s.add_runtime_dependency(%q<gd2-ffij>, [">= 0"])
|
55
|
+
s.add_runtime_dependency(%q<curb>, [">= 0"])
|
56
|
+
s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
|
57
|
+
s.add_development_dependency(%q<ruby-debug19>, [">= 0"])
|
58
|
+
s.add_development_dependency(%q<rspec>, ["~> 2.3.0"])
|
59
|
+
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
60
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
|
61
|
+
s.add_development_dependency(%q<rcov>, [">= 0"])
|
62
|
+
else
|
63
|
+
s.add_dependency(%q<redis-native_hash>, [">= 0"])
|
64
|
+
s.add_dependency(%q<gd2-ffij>, [">= 0"])
|
65
|
+
s.add_dependency(%q<curb>, [">= 0"])
|
66
|
+
s.add_dependency(%q<nokogiri>, [">= 0"])
|
67
|
+
s.add_dependency(%q<ruby-debug19>, [">= 0"])
|
68
|
+
s.add_dependency(%q<rspec>, ["~> 2.3.0"])
|
69
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
70
|
+
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
71
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
72
|
+
end
|
73
|
+
else
|
74
|
+
s.add_dependency(%q<redis-native_hash>, [">= 0"])
|
75
|
+
s.add_dependency(%q<gd2-ffij>, [">= 0"])
|
76
|
+
s.add_dependency(%q<curb>, [">= 0"])
|
77
|
+
s.add_dependency(%q<nokogiri>, [">= 0"])
|
78
|
+
s.add_dependency(%q<ruby-debug19>, [">= 0"])
|
79
|
+
s.add_dependency(%q<rspec>, ["~> 2.3.0"])
|
80
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
81
|
+
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
82
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
data/spec/a_spec.rb
ADDED
data/spec/doc_spec.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
# Exercises link and image extraction against the two saved pages in doc/.
describe "Seep::Doc" do
  before :all do
    # Resolve fixtures from this file's location so the suite does not depend
    # on being started from the gem root.
    fixtures = File.join(File.dirname(__FILE__), '..', 'doc')

    @doc_a = Seep::Doc.new(
      "http://linkleaf.com/test_a.html",
      File.read(File.join(fixtures, "test_a.html"))
    )
    @doc_b = Seep::Doc.new(
      "http://examancer.com/gallery/",
      File.read(File.join(fixtures, "test_b.html"))
    )
    @link_a  = "https://exanotes.com/"
    @link_b  = "http://linkleaf.com/index.php?leaf=212"
    @image_a = "http://l.yimg.com/a/i/us/we/52/34.gif"
    @image_b = "http://examancer.com/pictures/polished/bolGallery/thumbnail_Good%20Catch%20Missy.jpg"
  end

  describe "#links" do
    it "should contain absolute links" do
      @doc_a.links.member?(@link_a).should be_true
    end

    it "should contain relative links, converted to absolute format" do
      @doc_a.links.member?(@link_b).should be_true
    end
  end

  describe "#images" do
    it "should contain images from the test pages" do
      @doc_a.images.member?(@image_a).should be_true
      @doc_b.images.member?(@image_b).should be_true
    end
  end

  describe "#urls" do
    it "should contain both links and images" do
      @doc_a.urls.member?(@link_b).should be_true
      @doc_a.urls.member?(@image_a).should be_true
    end
  end
end
|
data/spec/fetcher_spec.rb
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
# Integration spec: every example talks to the live hosts named below, so it
# needs network access.
describe "Seep::Fetcher" do
  before :all do
    @link = Seep::Fetcher.open("http://linkleaf.com/test_a.html")
    @jpg  = Seep::Fetcher.open("http://examancer.com/pictures/funny/funny-pictures-ooooooo.jpg")
    @png  = Seep::Fetcher.open("http://examancer.com/pictures/funny/cnnpoll.png")
    @gif  = Seep::Fetcher.open("http://examancer.com/pictures/funny/lassie-shreds.gif")
    @txt  = Seep::Fetcher.open("http://examancer.com/remoteip.txt")
  end

  describe "#open" do
    it "populates #size" do
      @link.size.should == 56_607
    end
    it "populates #response_headers" do
      @link.response_headers["Server"].should match(/Apache/)
    end
    it "throws exception when :max_file_size is exceeded" do
      lambda do
        Seep::Fetcher.open(@link.url, max_file_size: 25_000)
      end.should(raise_exception(Curl::Err::WriteError))
    end
  end

  describe "#size" do
    it "should be the same as #body.length" do
      @link.size.should == @link.body.length
    end
  end

  describe "#dest_url" do
    it "should show the url resulting from a redirect" do
      Seep::Fetcher.open("http://reddit.com").dest_url.should == "http://www.reddit.com/"
    end
  end

  describe "#content_type" do
    it "should be nil before #open is called" do
      link = Seep::Fetcher.new("http://linkleaf.com/")
      link.content_type.should be_nil
    end
    it "returns the correct type for html" do
      @link.content_type.should == "text/html"
    end
    it "returns the correct type for images" do
      @jpg.content_type.should == "image/jpeg"
      @png.content_type.should == "image/png"
      @gif.content_type.should == "image/gif"
    end
    it "returns the correct type for text" do
      @txt.content_type.should == "text/plain"
    end
  end

  describe "#ext" do
    it "should be blank when #open has not been called" do
      # The example previously only built the fetcher and asserted nothing.
      Seep::Fetcher.new(@link.url).ext.should == ""
    end
    it "should give proper extensions for images" do
      @jpg.ext.should == ".jpg"
      @png.ext.should == ".png"
      @gif.ext.should == ".gif"
    end
  end
end
|
data/spec/image_spec.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
# Checks geometry and validity reporting using the JPEG fixture in doc/ and a
# string that is not image data.
describe "Seep::Image" do
  before :all do
    # Locate the fixture relative to this file and read it in binary mode so
    # the bytes reach GD2 unchanged on every platform.
    fixture = File.join(File.dirname(__FILE__), '..', 'doc', 'small.jpg')
    @small = Seep::Image.new(File.open(fixture, 'rb') { |file| file.read })
    @bad   = Seep::Image.new("Not an image")
  end

  describe "#dimensions" do
    it "should provide the dimensions of an image" do
      @small.dimensions.should == [320, 240]
    end
  end

  describe "#width" do
    it "should provide the width of an image" do
      @small.width.should == 320
    end
  end

  describe "#height" do
    it "should provide the height of an image" do
      @small.height.should == 240
    end
  end

  describe "#valid?" do
    it "should return true if the image is good" do
      @small.valid?.should be_true
    end
    it "should return false if the image is bad" do
      @bad.valid?.should be_false
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# Shared setup for every spec: puts lib/ and spec/ on the load path and loads
# RSpec plus the library under test.
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
$LOAD_PATH.unshift(File.dirname(__FILE__))
require 'rspec'
require 'seep'

# The debugger is only a development convenience; the suite must still load
# when it is not installed.
begin
  require 'ruby-debug'
rescue LoadError
  # Debugger unavailable: continue without it.
end

# Requires supporting files with custom matchers and macros, etc,
# in ./support/ and its subdirectories (sorted for a stable load order).
Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].sort.each { |f| require f }

RSpec.configure do |config|

end
|