seep 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,19 @@
1
+ require 'forwardable'
2
+ require 'uri'
3
+ require 'curb'
4
+ require 'nokogiri'
5
+ require 'gd2-ffij'
6
+
7
# Top-level namespace for the Seep fetching and parsing tools. Loading this
# module pulls in the fetcher, document and image classes.
module Seep
  require 'seep/fetcher'
  require 'seep/doc'
  require 'seep/image'

  class << self
    # Fetches a URL. Every argument is forwarded unchanged to Fetcher.open,
    # whose return value is returned.
    def fetch(*args)
      Fetcher.open(*args)
    end

    # Fetches a URL (same arguments as .fetch) and returns the result of
    # calling #to_doc on what was fetched.
    def doc(*args)
      fetched = fetch(*args)
      fetched.to_doc
    end
  end
end
@@ -0,0 +1,44 @@
1
module Seep
  # Wraps an HTML document loaded from a known URL and exposes the absolute
  # http(s) URLs referenced by its anchors and images.
  class Doc
    # URI.escape was removed in Ruby 3.0. The RFC 2396 parser applies the same
    # escaping rules; on older Rubies the default parser is that parser.
    URL_ESCAPER = defined?(URI::RFC2396_Parser) ? URI::RFC2396_Parser.new : URI::DEFAULT_PARSER

    attr_reader :dom, :url

    # url  - String URL the document came from; used as the base when
    #        resolving relative references.
    # html - String of HTML markup, parsed with Nokogiri.
    def initialize(url, html)
      @url = url
      @dom = Nokogiri::HTML.parse(html)
    end

    # Returns an Array of absolute URL Strings built from the href of every
    # <a> element. Links equal to the document's own URL and hrefs that do not
    # resolve to an http(s) URL are left out. The result is memoized.
    def links
      @links ||= attribute_urls("a", "href").reject { |link| link == url }
    end

    # Returns an Array of absolute URL Strings built from the src of every
    # <img> element, skipping anything that does not resolve to an http(s)
    # URL. The result is memoized.
    def images
      @images ||= attribute_urls("img", "src")
    end

    # Returns the links followed by the images, as one Array.
    def urls
      links + images
    end

    # Resolves a reference found in the document against the document URL.
    #
    # relative_url - String reference (relative or absolute), or nil.
    #
    # Returns the absolute URL String when it starts with "http" (any case).
    # Returns nil for nil input, for other schemes, and for references that
    # cannot be parsed even after escaping.
    def compute_url(relative_url)
      return nil if relative_url.nil?

      absolute = begin
        URI.join(@url, relative_url).to_s
      rescue StandardError
        # Typical cause: unescaped characters such as spaces. Escape and retry
        # once; a reference that is still invalid is skipped so one bad
        # attribute cannot abort the whole scan.
        begin
          URI.join(@url, URL_ESCAPER.escape(relative_url)).to_s
        rescue URI::Error
          return nil
        end
      end

      absolute =~ /\Ahttp/i ? absolute : nil
    end

    # Short summary: the document URL plus link and image counts.
    def inspect
      "#<Seep::Doc #{url} links: #{links.count}, images: #{images.count}>"
    end

    private

    # Collects compute_url results for the given attribute of every element
    # matching selector, dropping references that resolved to nil.
    def attribute_urls(selector, attribute)
      @dom.search(selector).map { |node| compute_url(node.get_attribute(attribute)) }.compact
    end
  end
end
@@ -0,0 +1,107 @@
1
module Seep
  # Downloads a single URL through a Curl::Easy handle, recording the response
  # headers and body and refusing responses larger than max_file_size.
  class Fetcher
    AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.121 Safari/535.2'

    attr_reader :curb, :size, :body
    attr_accessor :url, :max_file_size
    attr_accessor :request_headers, :response_headers

    # url     - String URL to fetch.
    # options - Hash of optional settings:
    #           :user_agent    - User-Agent header value (default: AGENT).
    #           :max_file_size - largest accepted size (default: 1_048_576).
    #           :max_redirects - redirects to follow (default: 5).
    def initialize(url, options = {})
      self.url = url
      self.request_headers = { 'User-Agent' => options[:user_agent] || AGENT }
      self.response_headers = {}
      self.max_file_size = options[:max_file_size] || 1_048_576 # 1MB

      @curb = Curl::Easy.new(url)
      @curb.follow_location = true
      @curb.max_redirects = options[:max_redirects] || 5

      register_on_header!
      register_on_body!
    end

    # Returns the raw Content-Type response header, or nil when none has been
    # received. The exact key is tried first; otherwise the header name is
    # matched without regard to case.
    def content_type
      response_headers['Content-Type'] || begin
        pair = response_headers.find { |name, _value| name.to_s.downcase == 'content-type' }
        pair && pair.last
      end
    end

    # Returns the media type from content_type, lower-cased and without
    # parameters ("text/html; charset=utf-8" => "text/html"), or nil when no
    # content type is known.
    def mime_type
      type = content_type.to_s.split(';', 2).first.to_s.strip.downcase
      type.empty? ? nil : type
    end

    # Returns the URL curb reports as last effective, falling back to the
    # requested URL.
    def dest_url
      curb.last_effective_url || url
    end

    # Performs the request and returns self. State left over from an earlier
    # call (body, size, headers, memoized doc/image) is discarded first.
    #
    # redirect - unused; kept so existing callers keep working.
    def open(redirect = 0)
      @body = ""
      @size = 0
      @image = @doc = nil
      self.response_headers = {}
      curb.headers = request_headers
      curb.perform
      self
    end

    # Short summary: the content type (when known) followed by the final URL.
    def inspect
      "#<Seep::Fetcher #{ content_type.nil? ? dest_url : content_type + ' ' + dest_url }>"
    end

    # Writes the downloaded body to path. The file is opened in binary mode so
    # image data is written byte-for-byte on every platform.
    def export(path)
      File.open(path, 'wb') do |file|
        file.write(@body)
      end
    end

    # Returns a file extension (leading dot included) for the media type, or
    # "" when the type is unknown or not one of the handled types.
    def ext
      case mime_type
      when "image/jpeg" then ".jpg"
      when "image/png"  then ".png"
      when "image/gif"  then ".gif"
      when "text/html"  then ".html"
      when "text/plain" then ".txt"
      else ""
      end
    end

    # True when the media type is image/* and the body loads as a valid
    # Seep::Image. The body is only decoded when the media type matches.
    def image?
      !!(mime_type =~ /\Aimage\//) && to_image.valid?
    end

    # Returns the body wrapped in a Seep::Image (memoized until the next open).
    def to_image
      @image ||= Seep::Image.new(body)
    end

    # True when the media type is text/html, with or without parameters.
    def doc?
      mime_type == "text/html"
    end

    # Returns the body wrapped in a Seep::Doc (memoized until the next open).
    # The final URL is used as the base so relative links still resolve
    # correctly after a redirect.
    def to_doc
      @doc ||= Seep::Doc.new(dest_url, body)
    end

    # Builds a fetcher for url and performs the request immediately.
    # Takes the same arguments as .new and returns the fetcher.
    def self.open(url, options = {})
      self.new(url, options).open
    end

    # Installs the header callback: stores each "Name: value" line in
    # response_headers and takes Content-Length as the expected size.
    def register_on_header!
      curb.on_header do |header|
        key, value = header.split(":", 2)
        if key && value
          key = key.strip
          value = value.strip
          @size = value.to_i if key.downcase == "content-length"
          response_headers[key] = value
        end
        if @size > max_file_size
          # Returning something other than the header length stops the
          # transfer (the spec expects Curl::Err::WriteError in this case).
          @size = -1
        else
          header.length
        end
      end
    end

    # Installs the body callback: appends each chunk to the body and stops the
    # transfer once the accumulated size exceeds max_file_size.
    def register_on_body!
      curb.on_body do |chunk|
        @body << chunk
        @size = @body.length
        if @size > max_file_size
          @size = -1
        else
          chunk.length
        end
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
# Thin wrapper around a GD2 image built from raw image data. Size queries are
# delegated to the underlying GD2 image.
class Seep::Image
  extend Forwardable

  attr_reader :image

  def_delegators :image, :size, :width, :height
  alias_method :dimensions, :size

  # data - raw image data handed to GD2::Image.load.
  #
  # A load failure is not raised to the caller; it is recorded and reported
  # through #valid?.
  def initialize(data)
    @image = GD2::Image.load(data)
    @valid = true
  rescue StandardError
    @valid = false
  end

  # True when the data given to the constructor loaded without an error.
  def valid?
    @valid
  end
end
@@ -0,0 +1,85 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
# Gem specification for seep 0.0.2. Per the file header this is generated by
# jeweler from the Rakefile, so lasting changes belong in Jeweler::Tasks.
+ Gem::Specification.new do |s|
7
+ s.name = "seep"
8
+ s.version = "0.0.2"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Carl Zulauf"]
12
+ s.date = "2011-12-27"
13
+ s.description = "Collection of web spidering and downloading tools using redis, curl, and gd."
14
+ s.email = "carl@linkleaf.com"
15
+ s.extra_rdoc_files = [
16
+ "LICENSE.txt",
17
+ "README"
18
+ ]
19
# Explicit list of every file packaged into the gem.
+ s.files = [
20
+ ".document",
21
+ ".rspec",
22
+ "Gemfile",
23
+ "Gemfile.lock",
24
+ "LICENSE.txt",
25
+ "README",
26
+ "Rakefile",
27
+ "doc/small.jpg",
28
+ "doc/test_a.html",
29
+ "doc/test_b.html",
30
+ "lib/seep.rb",
31
+ "lib/seep/doc.rb",
32
+ "lib/seep/fetcher.rb",
33
+ "lib/seep/image.rb",
34
+ "seep.gemspec",
35
+ "spec/a_spec.rb",
36
+ "spec/doc_spec.rb",
37
+ "spec/fetcher_spec.rb",
38
+ "spec/image_spec.rb",
39
+ "spec/spec_helper.rb",
40
+ "spider.rb",
41
+ "spidr_test.rb"
42
+ ]
43
+ s.homepage = "http://github.com/carlzulauf/seep"
44
+ s.licenses = ["MIT"]
45
+ s.require_paths = ["lib"]
46
+ s.rubygems_version = "1.8.10"
47
+ s.summary = "web spidering/downloading tools"
48
+
49
# Dependency declarations. When the running RubyGems understands
# specification_version and is at least 1.2.0, runtime and development
# dependencies are declared separately; in both fallback branches the same
# gems are all registered through the generic add_dependency.
+ if s.respond_to? :specification_version then
50
+ s.specification_version = 3
51
+
52
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
53
+ s.add_runtime_dependency(%q<redis-native_hash>, [">= 0"])
54
+ s.add_runtime_dependency(%q<gd2-ffij>, [">= 0"])
55
+ s.add_runtime_dependency(%q<curb>, [">= 0"])
56
+ s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
57
+ s.add_development_dependency(%q<ruby-debug19>, [">= 0"])
58
+ s.add_development_dependency(%q<rspec>, ["~> 2.3.0"])
59
+ s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
60
+ s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
61
+ s.add_development_dependency(%q<rcov>, [">= 0"])
62
+ else
63
+ s.add_dependency(%q<redis-native_hash>, [">= 0"])
64
+ s.add_dependency(%q<gd2-ffij>, [">= 0"])
65
+ s.add_dependency(%q<curb>, [">= 0"])
66
+ s.add_dependency(%q<nokogiri>, [">= 0"])
67
+ s.add_dependency(%q<ruby-debug19>, [">= 0"])
68
+ s.add_dependency(%q<rspec>, ["~> 2.3.0"])
69
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
70
+ s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
71
+ s.add_dependency(%q<rcov>, [">= 0"])
72
+ end
73
+ else
74
+ s.add_dependency(%q<redis-native_hash>, [">= 0"])
75
+ s.add_dependency(%q<gd2-ffij>, [">= 0"])
76
+ s.add_dependency(%q<curb>, [">= 0"])
77
+ s.add_dependency(%q<nokogiri>, [">= 0"])
78
+ s.add_dependency(%q<ruby-debug19>, [">= 0"])
79
+ s.add_dependency(%q<rspec>, ["~> 2.3.0"])
80
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
81
+ s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
82
+ s.add_dependency(%q<rcov>, [">= 0"])
83
+ end
84
+ end
85
+
@@ -0,0 +1,7 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
# Placeholder example: passes whenever the spec harness itself loads and runs.
describe "stuff" do
  it("should pass") { true.should be_true }
end
@@ -0,0 +1,42 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
# Specs for Seep::Doc, run against the two HTML fixtures under doc/ (paths are
# relative to the directory the specs are started from).
describe "Seep::Doc" do
  before :all do
    html_a = File.read("doc/test_a.html")
    html_b = File.read("doc/test_b.html")

    @doc_a = Seep::Doc.new("http://linkleaf.com/test_a.html", html_a)
    @doc_b = Seep::Doc.new("http://examancer.com/gallery/", html_b)

    # URLs the fixtures are expected to yield.
    @link_a  = "https://exanotes.com/"
    @link_b  = "http://linkleaf.com/index.php?leaf=212"
    @image_a = "http://l.yimg.com/a/i/us/we/52/34.gif"
    @image_b = "http://examancer.com/pictures/polished/bolGallery/thumbnail_Good%20Catch%20Missy.jpg"
  end

  describe "#links" do
    it "should contain absolute links" do
      @doc_a.links.should include(@link_a)
    end

    it "should contain relative links, converted to absolute format" do
      @doc_a.links.should include(@link_b)
    end
  end

  describe "#images" do
    it "should contain images from the test pages" do
      @doc_a.images.should include(@image_a)
      @doc_b.images.should include(@image_b)
    end
  end

  describe "#urls" do
    it "should contain both links and images" do
      combined = @doc_a.urls
      combined.should include(@link_b)
      combined.should include(@image_a)
    end
  end
end
@@ -0,0 +1,66 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
# Specs for Seep::Fetcher. The fixtures in before(:all) are fetched over HTTP
# from the hosts named below, so these examples need network access.
describe "Seep::Fetcher" do
  before :all do
    @link = Seep::Fetcher.open( "http://linkleaf.com/test_a.html" )
    @jpg  = Seep::Fetcher.open("http://examancer.com/pictures/funny/funny-pictures-ooooooo.jpg")
    @png  = Seep::Fetcher.open("http://examancer.com/pictures/funny/cnnpoll.png")
    @gif  = Seep::Fetcher.open("http://examancer.com/pictures/funny/lassie-shreds.gif")
    @txt  = Seep::Fetcher.open("http://examancer.com/remoteip.txt")
  end

  describe "#open" do
    it "populates #size" do
      @link.size.should == 56_607
    end
    it "populates #response_headers" do
      @link.response_headers["Server"].should match(/Apache/)
    end
    it "throws exception when :max_file_size is exceeded" do
      lambda do
        Seep::Fetcher.open( @link.url, max_file_size: 25_000 )
      end.should( raise_exception(Curl::Err::WriteError) )
    end
  end

  describe "#size" do
    it "should be the same as #body.length" do
      @link.size.should == @link.body.length
    end
  end

  describe "#dest_url" do
    it "should show the url resulting from a redirect" do
      Seep::Fetcher.open("http://reddit.com").dest_url.should == "http://www.reddit.com/"
    end
  end

  describe "#content_type" do
    it "should be nil before #open is called" do
      link = Seep::Fetcher.new("http://linkleaf.com/")
      link.content_type.should be_nil
    end
    it "returns the correct type for html" do
      @link.content_type.should == "text/html"
    end
    it "returns the correct type for images" do
      @jpg.content_type.should == "image/jpeg"
      @png.content_type.should == "image/png"
      @gif.content_type.should == "image/gif"
    end
    it "returns the correct type for text" do
      @txt.content_type.should == "text/plain"
    end
  end

  describe "#ext" do
    it "should be blank when #open has not been called" do
      # Previously this example only built the fetcher and asserted nothing.
      Seep::Fetcher.new(@link.url).ext.should == ""
    end
    it "should give proper extensions for images" do
      @jpg.ext.should == ".jpg"
      @png.ext.should == ".png"
      @gif.ext.should == ".gif"
    end
  end
end
@@ -0,0 +1,35 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
# Specs for Seep::Image, using the JPEG fixture under doc/ and a string that
# is not image data.
describe "Seep::Image" do
  before :all do
    # Read the JPEG as raw bytes; a text-mode read can alter binary data on
    # platforms that translate line endings.
    @small = Seep::Image.new( File.binread("doc/small.jpg") )
    @bad   = Seep::Image.new("Not an image")
  end

  describe "#dimensions" do
    it "should provide the dimensions of an image" do
      @small.dimensions.should == [320, 240]
    end
  end

  describe "#width" do
    it "should provide the width of an image" do
      @small.width.should == 320
    end
  end

  describe "#height" do
    it "should provide the height of an image" do
      @small.height.should == 240
    end
  end

  describe "#valid?" do
    it "should return true if the image is good" do
      @small.valid?.should be_true
    end
    it "should return false if the image is bad" do
      @bad.valid?.should be_false
    end
  end
end
@@ -0,0 +1,13 @@
1
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
3
+ require 'rspec'
4
+ require 'seep'
5
+ require 'ruby-debug'
6
+
7
+ # Requires supporting files with custom matchers and macros, etc,
8
+ # in ./support/ and its subdirectories.
9
# Load every Ruby file found below the support directory next to this helper.
support_glob = "#{File.dirname(__FILE__)}/support/**/*.rb"
Dir[support_glob].each { |support_file| require support_file }

# RSpec configuration hook; nothing is customised here.
RSpec.configure do |config|
end