scrape 0.1

data/.gitignore ADDED
@@ -0,0 +1,2 @@
+ .DS_Store
+ pkg/
data/Gemfile ADDED
@@ -0,0 +1,7 @@
+ source 'http://rubygems.org'
+
+ gemspec
+
+ group :test do
+   gem 'webmock', '~> 1.8.7'
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,22 @@
+ PATH
+   remote: .
+   specs:
+     scrape (0.1)
+
+ GEM
+   remote: http://rubygems.org/
+   specs:
+     addressable (2.2.8)
+     crack (0.3.1)
+     nokogiri (1.5.5)
+     webmock (1.8.7)
+       addressable (>= 2.2.7)
+       crack (>= 0.1.7)
+
+ PLATFORMS
+   ruby
+
+ DEPENDENCIES
+   nokogiri (~> 1.5.5)
+   scrape!
+   webmock (~> 1.8.7)
data/LICENSE ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2012 Marty Zalega
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,44 @@
+ # Scrape
+
+ A really simple web scraper.
+
+ ```ruby
+ site "https://github.com/explore" # The site to scrape. Will be used as the base address.
+
+ match /evilmarty/ do |doc| # A regexp/string/proc to match against the current url.
+
+   doc.search('a[href]') # `doc` is the nokogiri document of the contents of the current url.
+
+ end
+
+ site "http://www.tumblr.com" # Multiple sites can be defined.
+
+ match "/tagged" do |doc|
+   # Do whatever we want with the document.
+ end
+ ```
+
+ ## Usage
+
+ After creating a `Scrapefile`, simply run:
+
+ ```
+ scrape -f [FILE]
+ ```
+
+ If no scrapefile is specified, `Scrapefile` is used by default.
+
+ ## Installation
+
+ Simply install the gem:
+
+ ```
+ gem install scrape
+ ```
+
+ ## TODO
+
+ * Fix bugs
+ * Add support for robots.txt
+ * Depth limiting
+ * Better docs
data/Rakefile ADDED
@@ -0,0 +1,10 @@
+ require "bundler/gem_tasks"
+ require "rake/testtask"
+
+ Rake::TestTask.new do |t|
+   t.libs << "test"
+   t.test_files = FileList['test/**/*.rb']
+   t.verbose = true
+ end
+
+ task :default => "test"
data/bin/scrape ADDED
@@ -0,0 +1,8 @@
+ #!/usr/bin/env ruby
+
+ $: << File.expand_path('../../lib', __FILE__)
+
+ require "scrape"
+ require "scrape/cli"
+
+ Scrape::CLI.new(File.basename($0)).run(ARGV)
data/examples/github.scrape ADDED
@@ -0,0 +1,7 @@
+ site "https://github.com/explore"
+
+ match "explore" do |doc|
+   doc.css('ol.ranked-repositories li h3').each do |node|
+     puts node.inner_text.strip
+   end
+ end
data/examples/google.scrape ADDED
@@ -0,0 +1,5 @@
+ site "http://www.google.com/search?q=cats"
+
+ match "/search" do |doc|
+   puts "Page title: #{doc.css('title').inner_text}"
+ end
data/lib/scrape/application.rb ADDED
@@ -0,0 +1,57 @@
+ class Scrape::Application
+   attr_reader :scrapefile, :loader, :sites, :history
+
+   def initialize scrapefile, loader = Scrape::DefaultLoader.new
+     @scrapefile = File.expand_path scrapefile
+     @loader = loader
+     @sites = {}
+     @queue = []
+     @history = []
+   end
+
+   def run
+     load_scrapefile
+
+     while url = @queue.shift
+       Scrape.logger.info "Loading: #{url}..."
+       @history << url
+       if site = self[url]
+         if urls = site.parse(url)
+           enqueue *urls
+           Scrape.logger.info "Found #{urls.length} urls."
+         else
+           Scrape.logger.info "Done."
+         end
+       else
+         Scrape.logger.info "Not defined."
+       end
+     end
+   end
+
+   def reset
+     @history = []
+     @queue = sites.values.map{|site| site.url.to_s }
+   end
+
+   def queue
+     @queue.dup
+   end
+
+   def enqueue *urls
+     urls.flatten.each do |url|
+       @queue << url unless @history.include?(url) || @queue.include?(url)
+     end
+   end
+
+   def [] url
+     @sites.values.detect{|site| site.url < url }
+   end
+
+   def load_scrapefile
+     return if @scrapefile_loaded
+     result = loader.load(scrapefile)
+     @sites.update result if result.is_a? Hash
+     reset
+     @scrapefile_loaded = true
+   end
+ end
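
The application's `run` method is a simple breadth-first crawl: it loads the scrapefile, shifts URLs off the queue, records each in `history`, and enqueues whatever same-site links `Site#parse` returns, skipping anything already queued or visited. A minimal sketch of driving it programmatically, assuming a `Scrapefile` exists at the given path (this mirrors what `bin/scrape` ends up doing):

```ruby
require "scrape"

app = Scrape::Application.new "Scrapefile"  # path is a placeholder
app.run           # crawls until the queue is empty
puts app.history  # every URL visited, in crawl order
```
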
data/lib/scrape/cli.rb ADDED
@@ -0,0 +1,39 @@
+ require "optparse"
+
+ class Scrape::CLI
+   attr_reader :command, :pwd
+
+   def initialize command, pwd = Dir.pwd
+     @command, @pwd = command, pwd
+   end
+
+   def run argv
+     options = {:file => File.join(pwd, 'Scrapefile')}
+     opts = OptionParser.new do |opts|
+       opts.banner = "Scrape #{Scrape::VERSION} - Usage: #{command} [options]"
+       opts.separator ""
+       opts.separator "Specific options:"
+
+       opts.on "-f", "--scrapefile [FILE]", "Use FILE as scrapefile" do |file|
+         options[:file] = File.expand_path file
+       end
+       opts.on_tail "-h", "--help", "Show this message" do
+         puts opts
+         exit
+       end
+       opts.on_tail "-v", "--version", "Show version" do
+         puts Scrape::VERSION
+         exit
+       end
+     end
+     opts.parse argv
+
+     if File.exist? options[:file]
+       Scrape::Application.new(options[:file]).run
+     else
+       puts "#{command} aborted!"
+       puts "No Scrapefile found"
+       exit -1
+     end
+   end
+ end
data/lib/scrape/default_loader.rb ADDED
@@ -0,0 +1,19 @@
+ class Scrape::DefaultLoader
+   def load path
+     path = File.expand_path path
+     sites = {}
+
+     sandbox = Sandbox.new sites
+     sandbox.instance_eval File.read(path), path
+
+     sites
+   end
+
+   class Sandbox
+     include Scrape::DSL
+
+     def initialize sites
+       @sites = sites
+     end
+   end
+ end
data/lib/scrape/dsl.rb ADDED
@@ -0,0 +1,13 @@
+ module Scrape::DSL
+   def site *urls
+     @_sites ||= {}
+     @sites ||= {}
+     @current_sites = urls.flatten.map{|url| @_sites[url] ||= Scrape::Site.new(url) }
+   end
+
+   def match matcher, &proc
+     raise ArgumentError.new("site must be set") unless defined? @current_sites
+     matches = @current_sites.map{|site| @sites[site.url.to_s] = site; site.add_match matcher, &proc }
+     matches.size == 1 ? matches.first : matches
+   end
+ end
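
`site` accepts one or more URLs and remembers them as the current sites; each subsequent `match` is registered on every one of them. A hypothetical Scrapefile exercising both (the URLs and selector are placeholders):

```ruby
# Both sites share the match block that follows.
site "http://example.com", "http://example.org"

match "/articles" do |doc|
  # doc is the Nokogiri document handed over by Site#parse.
  doc.css("h2 a").each {|node| puts node.inner_text }
end
```
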
data/lib/scrape/match.rb ADDED
@@ -0,0 +1,23 @@
+ class Scrape::Match
+   attr_reader :matcher
+
+   def initialize matcher, &proc
+     @matcher, @proc = matcher, proc
+     raise ArgumentError.new("Match block expects one argument") if proc.arity != 1
+   end
+
+   def invoke doc
+     @proc.call doc
+   end
+
+   def =~ url
+     case @matcher
+     when String
+       url.to_s.include? @matcher
+     when Regexp
+       url.to_s =~ @matcher
+     when Proc
+       @matcher.call url
+     end
+   end
+ end
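
`#=~` dispatches on the matcher's type: a String is tested as a substring of the URL, a Regexp is matched against it, and a Proc is called with the URL and judged by its return value. Illustrative values (the URLs are placeholders):

```ruby
Scrape::Match.new("/tagged"){|doc| } =~ "http://example.com/tagged/cats"  # => true (substring)
Scrape::Match.new(/page\/\d+/){|doc| } =~ "http://example.com/page/2"     # => truthy (match offset)
Scrape::Match.new(lambda{|url| url.to_s.end_with? ".html" }){|doc| } =~ "http://example.com/a.html"  # => true
```
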
data/lib/scrape/site.rb ADDED
@@ -0,0 +1,31 @@
+ require 'nokogiri'
+
+ class Scrape::Site
+   attr_reader :url, :matches
+
+   def initialize url
+     @url = Scrape::URI.new url
+     @url.query = nil
+     @url.fragment = nil
+     @matches = []
+   end
+
+   def add_match matcher, &proc
+     match = Scrape::Match.new(matcher, &proc)
+     @matches << match
+     match
+   end
+
+   def parse url
+     url = self.url + url
+     doc = Nokogiri::HTML url.open
+
+     @matches.each{|match| match.invoke doc if match =~ url }
+
+     urls = doc.css("a[href]").map do |node|
+       href = self.url + node['href']
+       self.url < href ? href : nil
+     end
+     urls.compact
+   end
+ end
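
`#parse` fetches a page, invokes every match whose matcher hits the resolved URL, and returns only the links that fall under the site's base URL (everything else is dropped by the `self.url < href` test). A hedged sketch with a placeholder URL and selector; note that `parse` performs a real HTTP request:

```ruby
site = Scrape::Site.new "http://example.com"
site.add_match(/news/) {|doc| puts doc.css("title").inner_text }

# site.parse("/news") would fetch http://example.com/news, run the block
# above, and return only Scrape::URI links starting with http://example.com.
```
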
data/lib/scrape/uri.rb ADDED
@@ -0,0 +1,58 @@
+ require 'uri'
+ require 'open-uri'
+
+ class Scrape::URI
+   def initialize uri = nil
+     @uri = case uri
+       when URI then uri.clone
+       # URI has no public .new constructor, so build an empty generic URI instead.
+       when NilClass then URI.parse ''
+       else URI.parse uri.to_s
+     end
+   end
+
+   %w[fragment host hostname password path port query scheme user to_s relative? absolute?].each do |method_name|
+     class_eval <<-EOT, __FILE__, __LINE__ + 1
+       def #{method_name}
+         @uri.#{method_name}
+       end
+     EOT
+   end
+
+   %w[fragment host hostname password path port query scheme user].each do |method_name|
+     class_eval <<-EOT, __FILE__, __LINE__ + 1
+       def #{method_name}= value
+         @uri.#{method_name} = value
+       end
+     EOT
+   end
+
+   def + url
+     return clone if self == url
+     relative = (url.to_s =~ /^(?!.+:\/\/|\/)/)
+     uri = self.class.new @uri.merge(url)
+     uri.path = "#{@uri.path}#{uri.path}" if relative
+     uri
+   end
+
+   def < url
+     url[0, length] == to_s
+   end
+
+   def [] *args
+     to_s[*args]
+   end
+
+   def == url
+     to_s == url.to_s
+   end
+
+   def length
+     to_s.length
+   end
+   alias_method :size, :length
+
+   def open headers = {}, &block
+     headers = {"User-Agent" => Scrape.user_agent}.merge(headers)
+     # `super` dispatches to Kernel#open, which open-uri extends to fetch URLs.
+     super(to_s, headers, &block).read
+   end
+ end
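
`#+` merges like `URI#merge`, except that a path with no leading slash or scheme is appended to the current path rather than replacing its last segment; `#<` is a plain string-prefix test (is the given url inside this one?). The semantics, as pinned down by the tests in test/unit/uri_test.rb below:

```ruby
uri = Scrape::URI.new "http://www.example.com/foo"

(uri + "bar").to_s                         # => "http://www.example.com/foo/bar" (relative: appended)
(uri + "/bar").to_s                        # => "http://www.example.com/bar"     (absolute path: replaced)
(uri + "http://www.example.com/baz").to_s  # => "http://www.example.com/baz"     (full URL: merged)

uri < "http://www.example.com/foo/bar"     # => true
uri < "http://elsewhere.example/"          # => false
```
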
data/lib/scrape/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Scrape
+   VERSION = '0.1' unless defined? ::Scrape::VERSION
+ end
data/lib/scrape.rb ADDED
@@ -0,0 +1,36 @@
+ require "rubygems"
+ require "logger"
+ require "bundler/setup"
+
+ module Scrape
+   require 'scrape/version'
+
+   autoload 'Application', 'scrape/application'
+   autoload 'Site', 'scrape/site'
+   autoload 'Match', 'scrape/match'
+   autoload 'DefaultLoader', 'scrape/default_loader'
+   autoload 'DSL', 'scrape/dsl'
+   autoload 'URI', 'scrape/uri'
+
+   class ScrapeFileNotFound < Exception; end
+
+   class << self
+     attr_writer :user_agent
+
+     def user_agent
+       @user_agent || "Scrape/#{Scrape::VERSION}"
+     end
+
+     def logger
+       @logger ||= Logger.new STDOUT
+     end
+
+     def logger= log
+       @logger = log
+     end
+
+     def load_scrapefile path
+       Application.new path
+     end
+   end
+ end
data/scrape.gemspec ADDED
@@ -0,0 +1,22 @@
+ # -*- encoding: utf-8 -*-
+ require File.expand_path("../lib/scrape/version", __FILE__)
+
+ Gem::Specification.new do |s|
+   s.name        = "scrape"
+   s.version     = Scrape::VERSION
+   s.platform    = Gem::Platform::RUBY
+   s.authors     = ["Marty Zalega"]
+   s.email       = ["evilmarty@gmail.com"]
+   s.homepage    = "http://github.com/evilmarty/scrape"
+   s.summary     = %q{A really simple web scraper}
+   s.description = %q{An easy to use utility to scrape websites using a DSL similar to rake.}
+
+   s.rubyforge_project = "scrape"
+
+   s.files         = `git ls-files`.split("\n")
+   s.test_files    = `git ls-files -- {test}/*`.split("\n")
+   s.executables   = `git ls-files -- bin/*`.split("\n").map{|f| File.basename(f) }
+   s.require_paths = ["lib"]
+
+   s.add_development_dependency "nokogiri", "~> 1.5.5"
+ end
data/test/support/test1.scrape ADDED
@@ -0,0 +1,5 @@
+ site "http://example.com"
+
+ match "test" do |doc|
+   "boo!"
+ end
data/test/support/test2.scrape ADDED
@@ -0,0 +1 @@
+ site "http://example.com"
data/test/support/test3.scrape ADDED
@@ -0,0 +1,3 @@
+ match "test" do |doc|
+   "boo!"
+ end
data/test/test_helper.rb ADDED
@@ -0,0 +1,19 @@
+ $: << File.expand_path('../../lib', __FILE__)
+
+ require "minitest/autorun"
+ require "webmock/minitest"
+
+ require "bundler/setup"
+ Bundler.setup(:default, :test)
+
+ require "scrape"
+
+ class Scrape::TestCase < MiniTest::Unit::TestCase
+   class << self
+     def test name, &block
+       method_name = name.gsub /[^a-z0-9_]+/i, '_'
+       define_method "test_#{method_name}", &block
+     end
+     private :test
+   end
+ end
data/test/unit/application_test.rb ADDED
@@ -0,0 +1,72 @@
+ require "test_helper"
+
+ class ApplicationTest < Scrape::TestCase
+   SUPPORT_FILES = File.expand_path File.join(File.dirname(__FILE__), '..', 'support')
+
+   test "#load_scrapefile should parse the specified file" do
+     filepath = File.join(SUPPORT_FILES, 'test1.scrape')
+     app = Scrape::Application.new(filepath)
+     assert app.load_scrapefile, "scrape file failed to load"
+     assert_equal ["http://example.com"], app.sites.keys
+   end
+
+   test "#load_scrapefile should return nil when already loaded" do
+     filepath = File.join(SUPPORT_FILES, 'test1.scrape')
+     app = Scrape::Application.new(filepath)
+     assert app.load_scrapefile, "scrape file failed to load"
+     refute app.load_scrapefile, "scrape file should not have loaded again"
+   end
+
+   test "#[] should return the site that matches the given url" do
+     site1 = Scrape::Site.new "http://example.com"
+     site2 = Scrape::Site.new "http://example.org"
+     app = Scrape::Application.new(".")
+     app.sites.update site1.to_s => site1, site2.to_s => site2
+     assert_equal site1, app["http://example.com"]
+   end
+
+   test "#[] should return the site that is relative to the given url" do
+     site1 = Scrape::Site.new "http://example.com"
+     site2 = Scrape::Site.new "http://example.org"
+     app = Scrape::Application.new(".")
+     app.sites.update site1.to_s => site1, site2.to_s => site2
+     assert_equal site1, app["http://example.com/test"]
+   end
+
+   test "#[] should return nil when no site matches the given url" do
+     site1 = Scrape::Site.new "http://example.com"
+     site2 = Scrape::Site.new "http://example.org"
+     app = Scrape::Application.new(".")
+     app.sites.update site1.to_s => site1, site2.to_s => site2
+     assert_nil app["http://example.net"]
+   end
+
+   test "#reset should enqueue the sites that have been defined" do
+     site1 = Scrape::Site.new "http://example.com"
+     site2 = Scrape::Site.new "http://example.org"
+     app = Scrape::Application.new(".")
+     app.sites.update site1.to_s => site1, site2.to_s => site2
+     app.reset
+     assert_equal ["http://example.com", "http://example.org"], app.queue
+   end
+
+   test "#run should load the specified file" do
+     filepath = File.join(SUPPORT_FILES, 'test1.scrape')
+     test_loader = MiniTest::Mock.new
+     test_loader.expect :load, nil, [filepath]
+     Scrape::Application.new(filepath, test_loader).run
+     assert test_loader.verify, "loader did not receive file"
+   end
+
+   test "#enqueue should add the given url to the queue" do
+     app = Scrape::Application.new(".")
+     app.enqueue "http://example.com"
+     assert_equal ["http://example.com"], app.queue
+   end
+
+   test "#enqueue should not add the given url to the queue when it has already been added" do
+     app = Scrape::Application.new(".")
+     3.times{ app.enqueue "http://example.com" }
+     assert_equal ["http://example.com"], app.queue
+   end
+ end
data/test/unit/default_loader_test.rb ADDED
@@ -0,0 +1,25 @@
+ require "test_helper"
+
+ class DefaultLoaderTest < Scrape::TestCase
+   SUPPORT_FILES = File.expand_path File.join(File.dirname(__FILE__), '..', 'support')
+
+   test "#load should return sites parsed from the specified file" do
+     loader = Scrape::DefaultLoader.new
+     sites = loader.load File.join(SUPPORT_FILES, "test1.scrape")
+     assert_equal ["http://example.com"], sites.keys
+     assert_instance_of Scrape::Site, sites.values[0]
+   end
+
+   test "#load should return an empty hash when no matches have been defined" do
+     loader = Scrape::DefaultLoader.new
+     sites = loader.load File.join(SUPPORT_FILES, "test2.scrape")
+     assert_equal Hash.new, sites
+   end
+
+   test "#load should raise an error when no site is defined" do
+     loader = Scrape::DefaultLoader.new
+     assert_raises ArgumentError do
+       loader.load File.join(SUPPORT_FILES, "test3.scrape")
+     end
+   end
+ end
data/test/unit/match_test.rb ADDED
@@ -0,0 +1,54 @@
+ require "test_helper"
+
+ class MatchTest < Scrape::TestCase
+   test "#initialize should raise error when proc's arity isn't one" do
+     assert_raises ArgumentError do
+       Scrape::Match.new("test"){ "no arguments" }
+     end
+   end
+
+   test "#invoke should call the proc" do
+     ok = false
+     match = Scrape::Match.new("test"){|doc| ok = true }
+     match.invoke nil
+     assert ok, "Proc was not called"
+   end
+
+   test "#invoke should pass the document to the proc" do
+     doc = "yay"
+     ok = false
+     match = Scrape::Match.new("test"){|d| ok = (doc == d) }
+     match.invoke doc
+     assert ok, "Document was not passed into the proc"
+   end
+
+   test "#=~ should return true when the url contains the string" do
+     match = Scrape::Match.new("bar"){|doc|}
+     assert match =~ "foobar", "Expected true"
+   end
+
+   test "#=~ should return false when the url doesn't contain the string" do
+     match = Scrape::Match.new("bar"){|doc|}
+     refute match =~ "ponies", "Expected false"
+   end
+
+   test "#=~ should return true when the url matches the regexp" do
+     match = Scrape::Match.new(/bar/){|doc|}
+     assert match =~ "foobar", "Expected true"
+   end
+
+   test "#=~ should return false when the url doesn't match the regexp" do
+     match = Scrape::Match.new(/bar/){|doc|}
+     refute match =~ "ponies", "Expected false"
+   end
+
+   test "#=~ should return true when the proc is truthy" do
+     match = Scrape::Match.new(lambda{|url| true }){|doc|}
+     assert match =~ "ponies", "Expected true"
+   end
+
+   test "#=~ should return false when the proc is falsy" do
+     match = Scrape::Match.new(lambda{|url| false }){|doc|}
+     refute match =~ "ponies", "Expected false"
+   end
+ end
data/test/unit/site_test.rb ADDED
@@ -0,0 +1,75 @@
+ require "test_helper"
+
+ class SiteTest < Scrape::TestCase
+   test "#add_match should create a Match object and add it to the collection" do
+     site = Scrape::Site.new "http://www.example.com"
+     match = site.add_match("/test") { |doc| }
+     assert_instance_of Scrape::Match, match
+   end
+
+   test "#parse should return absolute urls that match the site's url" do
+     stub_request(:get, "http://www.example.com/test").
+       with(:headers => {"User-Agent" => Scrape.user_agent}).
+       to_return(:status => 200, :body => <<-HTML)
+         <html>
+           <body>
+             <a href="http://www.example.com/link1.html">link 1</a>
+             <a href="http://example.com/link2.html">link 2</a>
+             <a href="http://example.org/link3.html">link 3</a>
+           </body>
+         </html>
+       HTML
+
+     site = Scrape::Site.new "http://www.example.com"
+     assert_equal [Scrape::URI.new("http://www.example.com/link1.html")], site.parse("/test")
+   end
+
+   test "#parse should return relative urls to the site" do
+     stub_request(:get, "http://www.example.com/test").
+       with(:headers => {"User-Agent" => Scrape.user_agent}).
+       to_return(:status => 200, :body => <<-HTML)
+         <html>
+           <body>
+             <a href="link1.html">link 1</a>
+           </body>
+         </html>
+       HTML
+
+     site = Scrape::Site.new "http://www.example.com"
+     assert_equal [Scrape::URI.new("http://www.example.com/link1.html")], site.parse("/test")
+   end
+
+   test "#parse should return no urls" do
+     stub_request(:get, "http://www.example.com/test").
+       with(:headers => {"User-Agent" => Scrape.user_agent}).
+       to_return(:status => 200, :body => <<-HTML)
+         <html>
+           <body>
+             <a href="/link1.html">link 1</a>
+           </body>
+         </html>
+       HTML
+
+     site = Scrape::Site.new "http://www.example.com/test"
+     assert_equal [], site.parse("/test")
+   end
+
+   test "#parse should invoke Match when hit" do
+     stub_request(:get, "http://www.example.com/test").
+       with(:headers => {"User-Agent" => Scrape.user_agent}).
+       to_return(:status => 200, :body => <<-HTML)
+         <html>
+           <body>
+             <a href="link1.html">link 1</a>
+           </body>
+         </html>
+       HTML
+
+     ok = false
+     site = Scrape::Site.new "http://www.example.com"
+     site.add_match(/test/){|doc| ok = true }
+     site.parse "/test"
+
+     assert ok, "Match was not invoked"
+   end
+ end
data/test/unit/uri_test.rb ADDED
@@ -0,0 +1,52 @@
+ require "test_helper"
+
+ class URITest < Scrape::TestCase
+   {fragment: "blah", host: "www.example.com", password: "secret", path: "/dot", query: "foo=bar", scheme: "http", user: "chuck", relative?: false, absolute?: true}.each do |method_name, value|
+     test "##{method_name} should return value" do
+       uri = Scrape::URI.new "http://chuck:secret@www.example.com/dot?foo=bar#blah"
+       assert_equal value, uri.send(method_name)
+     end
+   end
+
+   test "#open should return the contents at the url" do
+     stub_request(:get, "http://www.example.com/").with(headers: {"User-Agent" => Scrape.user_agent}).to_return(status: 200, body: "Howdie")
+
+     uri = Scrape::URI.new "http://www.example.com"
+     assert_equal "Howdie", uri.open
+   end
+
+   test "#+ should return a URI with the specified path" do
+     uri1 = Scrape::URI.new "http://www.example.com"
+     uri2 = uri1 + "/bar"
+     assert_equal "http://www.example.com/bar", uri2.to_s
+   end
+
+   test "#+ should return a URI overwriting with the specified path" do
+     uri1 = Scrape::URI.new "http://www.example.com/foo"
+     uri2 = uri1 + "/bar"
+     assert_equal "http://www.example.com/bar", uri2.to_s
+   end
+
+   test "#+ should return a URI with the specified path appended" do
+     uri1 = Scrape::URI.new "http://www.example.com/foo"
+     uri2 = uri1 + "bar"
+     assert_equal "http://www.example.com/foo/bar", uri2.to_s
+   end
+
+   test "#+ should return a URI from the absolute url" do
+     uri1 = Scrape::URI.new "http://www.example.com/foo"
+     uri2 = uri1 + "http://www.example.com/bar"
+     assert_equal "http://www.example.com/bar", uri2.to_s
+   end
+
+   test "#+ should return a URI appended from the absolute url" do
+     uri1 = Scrape::URI.new "http://www.example.com/foo"
+     uri2 = uri1 + "http://www.example.com/foo/bar"
+     assert_equal "http://www.example.com/foo/bar", uri2.to_s
+   end
+
+   test "#< should return true when specified url is greater" do
+     uri1 = Scrape::URI.new "http://www.example.com/foo"
+     assert uri1 < "http://www.example.com/foo/bar"
+   end
+ end
metadata ADDED
@@ -0,0 +1,90 @@
+ --- !ruby/object:Gem::Specification
+ name: scrape
+ version: !ruby/object:Gem::Version
+   version: '0.1'
+ prerelease:
+ platform: ruby
+ authors:
+ - Marty Zalega
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2012-07-10 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.5.5
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.5.5
+ description: An easy to use utility to scrape websites using a DSL similar to rake.
+ email:
+ - evilmarty@gmail.com
+ executables:
+ - scrape
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - Gemfile
+ - Gemfile.lock
+ - LICENSE
+ - README.md
+ - Rakefile
+ - bin/scrape
+ - examples/github.scrape
+ - examples/google.scrape
+ - lib/scrape.rb
+ - lib/scrape/application.rb
+ - lib/scrape/cli.rb
+ - lib/scrape/default_loader.rb
+ - lib/scrape/dsl.rb
+ - lib/scrape/match.rb
+ - lib/scrape/site.rb
+ - lib/scrape/uri.rb
+ - lib/scrape/version.rb
+ - scrape.gemspec
+ - test/support/test1.scrape
+ - test/support/test2.scrape
+ - test/support/test3.scrape
+ - test/test_helper.rb
+ - test/unit/application_test.rb
+ - test/unit/default_loader_test.rb
+ - test/unit/match_test.rb
+ - test/unit/site_test.rb
+ - test/unit/uri_test.rb
+ homepage: http://github.com/evilmarty/scrape
+ licenses: []
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project: scrape
+ rubygems_version: 1.8.23
+ signing_key:
+ specification_version: 3
+ summary: A really simple web scraper
+ test_files: []