seep 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ require 'forwardable'
2
+ require 'uri'
3
+ require 'curb'
4
+ require 'nokogiri'
5
+ require 'gd2-ffij'
6
+
7
# Top-level namespace of the seep gem. Loads the Fetcher, Doc and Image
# components and offers two shortcuts for downloading a URL.
module Seep
  %w[fetcher doc image].each { |component| require "seep/#{component}" }

  class << self
    # Passes every argument straight to Fetcher.open and returns its result.
    def fetch(*args)
      Fetcher.open(*args)
    end

    # Same as .fetch, but returns the result of calling #to_doc on it.
    def doc(*args)
      fetch(*args).to_doc
    end
  end
end
@@ -0,0 +1,44 @@
1
module Seep
  # An HTML page parsed with Nokogiri, exposing the absolute http(s) URLs
  # referenced by its <a href> and <img src> attributes.
  class Doc
    # dom: the document produced by Nokogiri::HTML.parse
    # url: the URL handed to #initialize; relative references resolve against it
    attr_reader :dom, :url

    # url  - address the markup belongs to (base for relative references)
    # html - raw markup to parse
    def initialize(url, html)
      @url = url
      @dom = Nokogiri::HTML.parse(html)
    end

    # Absolute URLs of every anchor on the page, in document order.
    # Anchors without a usable href and links equal to #url are left out.
    # The result is memoized.
    def links
      @links ||= absolute_urls("a", "href").reject { |link| link == url }
    end

    # Absolute URLs of every image on the page, in document order.
    # Images without a usable src are left out. The result is memoized.
    def images
      @images ||= absolute_urls("img", "src")
    end

    # Links followed by images.
    def urls
      links + images
    end

    # Resolves +relative_url+ against #url.
    #
    # The reference is first joined as-is; when that is rejected it is
    # percent-escaped and joined again (this is what lets hrefs containing
    # spaces through). Returns the absolute URL as a String, or nil when the
    # reference is nil, cannot be resolved even after escaping, or does not
    # resolve to something starting with "http" (mailto:, javascript:, ...).
    def compute_url(relative_url)
      return nil if relative_url.nil?

      absolute = resolve(relative_url) || resolve(relative_url, true)
      absolute if absolute && absolute =~ /\Ahttp/i
    end

    def inspect
      "#<Seep::Doc #{url} links: #{links.count}, images: #{images.count}>"
    end

    private

    # Collects the resolvable +attribute+ values of every node matching +selector+.
    def absolute_urls(selector, attribute)
      @dom.search(selector).map { |node| compute_url(node.get_attribute(attribute)) }.compact
    end

    # Joins +reference+ onto the base URL, optionally escaping it first.
    # Returns nil instead of raising so one malformed attribute cannot abort
    # the extraction of every other URL on the page.
    def resolve(reference, escape = false)
      reference = uri_escaper.escape(reference) if escape
      URI.join(@url, reference).to_s
    rescue URI::Error, ArgumentError
      nil
    end

    # URI.escape was removed in Ruby 3.0; the RFC 2396 parser still provides
    # the same escaping. Older rubies only expose it as DEFAULT_PARSER.
    def uri_escaper
      defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER
    end
  end
end
@@ -0,0 +1,107 @@
1
module Seep
  # Downloads one URL through curb (Curl::Easy), following redirects and
  # aborting the transfer as soon as the response is known to exceed
  # #max_file_size bytes.
  class Fetcher
    AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.121 Safari/535.2'

    attr_reader :curb, :size, :body
    attr_accessor :url, :max_file_size
    attr_accessor :request_headers, :response_headers

    # url     - address to download
    # options - :user_agent    (default AGENT)
    #           :max_file_size (bytes, default 1MB)
    #           :max_redirects (default 5)
    def initialize(url, options = {})
      self.url = url
      self.request_headers = { 'User-Agent' => options[:user_agent] || AGENT }
      self.response_headers = {}
      self.max_file_size = options[:max_file_size] || 1_048_576 # 1MB

      @curb = Curl::Easy.new(url)
      @curb.follow_location = true
      @curb.max_redirects = options[:max_redirects] || 5

      register_on_header!
      register_on_body!
    end

    # Raw Content-Type header of the response (parameters included), or nil.
    def content_type
      response_header('Content-Type')
    end

    # Value of a response header looked up by exact name first and then
    # case-insensitively (header names arrive lowercase over HTTP/2).
    def response_header(name)
      return response_headers[name] if response_headers.key?(name)

      wanted = name.to_s.downcase
      pair = response_headers.find { |key, _value| key.to_s.downcase == wanted }
      pair && pair.last
    end

    # URL curb reports as the last effective one, falling back to #url.
    def dest_url
      curb.last_effective_url || url
    end

    # Performs the request and returns self. +redirect+ is unused and kept
    # only so existing callers that pass it keep working.
    def open(redirect = 0)
      @body = "".dup
      @size = 0
      @image = @doc = nil # drop conversions memoized from a previous download
      curb.headers = request_headers
      curb.perform
      self
    end

    def inspect
      "#<Seep::Fetcher #{[content_type, dest_url].compact.join(' ')}>"
    end

    # Writes the downloaded body to +path+. Binary mode keeps image data
    # intact on platforms that translate line endings in text mode.
    def export(path)
      File.open(path, 'wb') { |file| file.write(@body) }
    end

    # File extension matching the response's media type, "" when unknown.
    def ext
      case media_type
      when "image/jpeg" then ".jpg"
      when "image/png"  then ".png"
      when "image/gif"  then ".gif"
      when "text/html"  then ".html"
      when "text/plain" then ".txt"
      else ""
      end
    end

    # True when the response declares an image media type and the body is
    # accepted by Seep::Image. (The previous `!! content_type =~ /^image/`
    # applied =~ to a boolean, so the check could never succeed.)
    def image?
      media_type.to_s.start_with?("image") && to_image.valid?
    end

    def to_image
      @image ||= Seep::Image.new(body)
    end

    # True when the response's media type is text/html, regardless of any
    # parameters such as "; charset=utf-8".
    def doc?
      media_type == "text/html"
    end

    # Wraps the body in a Seep::Doc based at #dest_url, so relative links
    # resolve against the address the content was actually served from.
    def to_doc
      @doc ||= Seep::Doc.new(dest_url, body)
    end

    # Builds a fetcher for +url+ and performs the request immediately.
    def self.open(url, options = {})
      new(url, options).open
    end

    # Installs the header callback: records each "Name: value" line in
    # #response_headers and tracks the declared Content-Length.
    def register_on_header!
      curb.on_header do |header|
        if header =~ /\AHTTP\//
          # A status line starts a new response (redirect hop or final
          # answer); headers of the previous hop must not leak into it.
          response_headers.clear
          @size = 0
        else
          key, value = header.split(":", 2)
          unless key.nil? || value.nil?
            key = key.strip
            value = value.strip
            @size = value.to_i if key.downcase == "content-length"
            response_headers[key] = value
          end
        end
        bytes_handled(header)
      end
    end

    # Installs the body callback: appends each chunk to #body and keeps
    # #size equal to the number of bytes received so far.
    def register_on_body!
      curb.on_body do |chunk|
        @body << chunk # in-place append; += re-copied the whole body per chunk
        @size = @body.bytesize
        bytes_handled(chunk)
      end
    end

    private

    # Content-Type without parameters, lowercased; nil when no header was seen.
    def media_type
      raw = content_type
      raw && raw.split(';', 2).first.to_s.strip.downcase
    end

    # Return value for a curb callback: the chunk's byte count to continue,
    # or -1 (also stored in @size) to make curl abort once the limit is hit.
    def bytes_handled(chunk)
      if @size > max_file_size
        @size = -1
      else
        chunk.bytesize
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
# Wraps image data loaded through GD2::Image.load and remembers whether
# loading succeeded.
class Seep::Image
  extend Forwardable

  # The object returned by GD2::Image.load; never assigned when loading failed.
  attr_reader :image

  def_delegators :image, :size, :width, :height
  alias_method :dimensions, :size

  # data - whatever GD2::Image.load accepts. Any StandardError raised while
  # loading marks the instance as invalid instead of propagating.
  def initialize(data)
    @image = GD2::Image.load(data)
    @valid = true
  rescue
    @valid = false
  end

  # true when GD2::Image.load succeeded in #initialize, false otherwise.
  def valid?
    @valid
  end
end
@@ -0,0 +1,85 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
# Gem specification for seep 0.0.2.
Gem::Specification.new do |s|
  s.name = "seep"
  s.version = "0.0.2"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Carl Zulauf"]
  s.date = "2011-12-27"
  s.description = "Collection of web spidering and downloading tools using redis, curl, and gd."
  s.email = "carl@linkleaf.com"
  s.extra_rdoc_files = %w[LICENSE.txt README]
  s.files = %w[
    .document
    .rspec
    Gemfile
    Gemfile.lock
    LICENSE.txt
    README
    Rakefile
    doc/small.jpg
    doc/test_a.html
    doc/test_b.html
    lib/seep.rb
    lib/seep/doc.rb
    lib/seep/fetcher.rb
    lib/seep/image.rb
    seep.gemspec
    spec/a_spec.rb
    spec/doc_spec.rb
    spec/fetcher_spec.rb
    spec/image_spec.rb
    spec/spec_helper.rb
    spider.rb
    spidr_test.rb
  ]
  s.homepage = "http://github.com/carlzulauf/seep"
  s.licenses = ["MIT"]
  s.require_paths = ["lib"]
  s.rubygems_version = "1.8.10"
  s.summary = "web spidering/downloading tools"

  runtime_deps = [
    ["redis-native_hash", ">= 0"],
    ["gd2-ffij", ">= 0"],
    ["curb", ">= 0"],
    ["nokogiri", ">= 0"]
  ]
  development_deps = [
    ["ruby-debug19", ">= 0"],
    ["rspec", "~> 2.3.0"],
    ["bundler", "~> 1.0.0"],
    ["jeweler", "~> 1.6.4"],
    ["rcov", ">= 0"]
  ]

  # Runtime and development dependencies are only declared separately when
  # the spec understands specification_version and RubyGems is >= 1.2.0;
  # otherwise every entry is registered through add_dependency.
  typed = false
  if s.respond_to? :specification_version
    s.specification_version = 3
    typed = Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0')
  end

  runtime_deps.each do |name, requirement|
    if typed
      s.add_runtime_dependency(name, [requirement])
    else
      s.add_dependency(name, [requirement])
    end
  end

  development_deps.each do |name, requirement|
    if typed
      s.add_development_dependency(name, [requirement])
    else
      s.add_dependency(name, [requirement])
    end
  end
end
85
+
@@ -0,0 +1,7 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
# Smoke test: passes as long as the suite loads and RSpec can run an example.
describe "stuff" do
  it("should pass") { true.should be_true }
end
@@ -0,0 +1,42 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
# Exercises Seep::Doc against the two HTML fixtures shipped in doc/.
describe "Seep::Doc" do
  # Fixtures are located from this file's position rather than the current
  # working directory, so the suite can be started from anywhere.
  fixture = lambda do |name|
    File.read(File.expand_path("../doc/#{name}", File.dirname(__FILE__)))
  end

  before :all do
    @doc_a = Seep::Doc.new("http://linkleaf.com/test_a.html", fixture.call("test_a.html"))
    @doc_b = Seep::Doc.new("http://examancer.com/gallery/", fixture.call("test_b.html"))
    @link_a = "https://exanotes.com/"
    @link_b = "http://linkleaf.com/index.php?leaf=212"
    @image_a = "http://l.yimg.com/a/i/us/we/52/34.gif"
    @image_b = "http://examancer.com/pictures/polished/bolGallery/thumbnail_Good%20Catch%20Missy.jpg"
  end

  describe "#links" do
    it "should contain absolute links" do
      @doc_a.links.member?(@link_a).should be_true
    end

    it "should contain relative links, converted to absolute format" do
      @doc_a.links.member?(@link_b).should be_true
    end
  end

  describe "#images" do
    it "should contain images from the test pages" do
      @doc_a.images.member?(@image_a).should be_true
      @doc_b.images.member?(@image_b).should be_true
    end
  end

  describe "#urls" do
    it "should contain both links and images" do
      @doc_a.urls.member?(@link_b).should be_true
      @doc_a.urls.member?(@image_a).should be_true
    end
  end
end
@@ -0,0 +1,66 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
# Exercises Seep::Fetcher against live URLs; these examples need network access.
describe "Seep::Fetcher" do
  before :all do
    @link = Seep::Fetcher.open( "http://linkleaf.com/test_a.html" )
    @jpg = Seep::Fetcher.open("http://examancer.com/pictures/funny/funny-pictures-ooooooo.jpg")
    @png = Seep::Fetcher.open("http://examancer.com/pictures/funny/cnnpoll.png")
    @gif = Seep::Fetcher.open("http://examancer.com/pictures/funny/lassie-shreds.gif")
    @txt = Seep::Fetcher.open("http://examancer.com/remoteip.txt")
  end

  describe "#open" do
    it "populates #size" do
      @link.size.should == 56_607
    end
    it "populates #response_headers" do
      @link.response_headers["Server"].should match(/Apache/)
    end
    it "throws exception when :max_file_size is exceeded" do
      lambda do
        Seep::Fetcher.open( @link.url, max_file_size: 25_000 )
      end.should( raise_exception(Curl::Err::WriteError) )
    end
  end

  describe "#size" do
    it "should be the same as #body.length" do
      @link.size.should == @link.body.length
    end
  end

  describe "#dest_url" do
    it "should show the url resulting from a redirect" do
      Seep::Fetcher.open("http://reddit.com").dest_url.should == "http://www.reddit.com/"
    end
  end

  describe "#content_type" do
    it "should be nil before #open is called" do
      link = Seep::Fetcher.new("http://linkleaf.com/")
      link.content_type.should be_nil
    end
    it "returns the correct type for html" do
      @link.content_type.should == "text/html"
    end
    it "returns the correct type for images" do
      @jpg.content_type.should == "image/jpeg"
      @png.content_type.should == "image/png"
      @gif.content_type.should == "image/gif"
    end
    it "returns the correct type for text" do
      @txt.content_type.should == "text/plain"
    end
  end

  describe "#ext" do
    it "should be blank when #open has not been called" do
      # The example previously built the fetcher but asserted nothing.
      Seep::Fetcher.new(@link.url).ext.should == ""
    end
    it "should give proper extensions for images" do
      @jpg.ext.should == ".jpg"
      @png.ext.should == ".png"
      @gif.ext.should == ".gif"
    end
  end
end
@@ -0,0 +1,35 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
# Exercises Seep::Image with the JPEG fixture shipped in doc/ and with
# a string that is not image data.
describe "Seep::Image" do
  before :all do
    # Located from this file (not the working directory) and read in binary
    # mode so the JPEG bytes reach the loader unmodified on every platform.
    fixture = File.expand_path("../doc/small.jpg", File.dirname(__FILE__))
    @small = Seep::Image.new( File.open(fixture, "rb") { |io| io.read } )
    @bad = Seep::Image.new("Not an image")
  end

  describe "#dimensions" do
    it "should provide the dimensions of an image" do
      @small.dimensions.should == [320, 240]
    end
  end

  describe "#width" do
    it "should provide the width of an image" do
      @small.width.should == 320
    end
  end

  describe "#height" do
    it "should provide the height of an image" do
      @small.height.should == 240
    end
  end

  describe "#valid?" do
    it "should return true if the image is good" do
      @small.valid?.should be_true
    end
    it "should return false if the image is bad" do
      @bad.valid?.should be_false
    end
  end
end
@@ -0,0 +1,13 @@
1
# Shared setup for every spec: puts lib/ and spec/ on the load path, loads
# RSpec, the gem and the debugger, then pulls in any support files.
spec_dir = File.dirname(__FILE__)

[File.join(spec_dir, '..', 'lib'), spec_dir].each do |dir|
  $LOAD_PATH.unshift(dir)
end

%w[rspec seep ruby-debug].each { |library| require library }

# Requires supporting files with custom matchers and macros, etc,
# in ./support/ and its subdirectories.
Dir["#{spec_dir}/support/**/*.rb"].each do |support_file|
  require support_file
end

RSpec.configure do |config|
end