scrape 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,2 @@
+ .DS_Store
+ pkg/
data/Gemfile ADDED
@@ -0,0 +1,7 @@
+ source 'http://rubygems.org'
+
+ gemspec
+
+ group :test do
+   gem 'webmock', '~> 1.8.7'
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,22 @@
+ PATH
+   remote: .
+   specs:
+     scrape (0.1)
+
+ GEM
+   remote: http://rubygems.org/
+   specs:
+     addressable (2.2.8)
+     crack (0.3.1)
+     nokogiri (1.5.5)
+     webmock (1.8.7)
+       addressable (>= 2.2.7)
+       crack (>= 0.1.7)
+
+ PLATFORMS
+   ruby
+
+ DEPENDENCIES
+   nokogiri (~> 1.5.5)
+   scrape!
+   webmock (~> 1.8.7)
data/LICENSE ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2012 Marty Zalega
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,44 @@
+ # Scrape
+
+ A really simple web scraper.
+
+ ```ruby
+ site "https://github.com/explore" # The site to scrape. Will be used as the base address.
+
+ match /evilmarty/ do |doc| # A regexp/string/proc to match against the current url.
+
+   doc.search('a[href]') # doc is the Nokogiri document for the current url.
+
+ end
+
+ site "http://www.tumblr.com" # Multiple sites can be defined.
+
+ match "/tagged" do |doc|
+   # Do whatever we want with the document.
+ end
+ ```
+
+ ## Usage
+
+ After creating a `Scrapefile`, simply run:
+
+ ```
+ scrape -f [FILE]
+ ```
+
+ If no scrapefile is specified then `Scrapefile` is used by default.
+
+ ## Installation
+
+ Simply install the gem:
+
+ ```
+ gem install scrape
+ ```
+
+ ## TODO
+
+ * Fix bugs
+ * Add support for robots.txt
+ * Depth limiting
+ * Better docs
data/Rakefile ADDED
@@ -0,0 +1,10 @@
+ require "bundler/gem_tasks"
+ require "rake/testtask"
+
+ Rake::TestTask.new do |t|
+   t.libs << "test"
+   t.test_files = FileList['test/**/*.rb']
+   t.verbose = true
+ end
+
+ task :default => "test"
data/bin/scrape ADDED
@@ -0,0 +1,8 @@
+ #!/usr/bin/env ruby
+
+ $: << File.expand_path('../../lib', __FILE__)
+
+ require "scrape"
+ require "scrape/cli"
+
+ Scrape::CLI.new(File.basename($0)).run(ARGV)
data/examples/github.scrape ADDED
@@ -0,0 +1,7 @@
+ site "https://github.com/explore"
+
+ match "explore" do |doc|
+   doc.css('ol.ranked-repositories li h3').each do |node|
+     puts node.inner_text.strip
+   end
+ end
data/examples/google.scrape ADDED
@@ -0,0 +1,5 @@
+ site "http://www.google.com/search?q=cats"
+
+ match "/search" do |doc|
+   puts "Page title: #{doc.css('title').inner_text}"
+ end
data/lib/scrape/application.rb ADDED
@@ -0,0 +1,57 @@
+ class Scrape::Application
+   attr_reader :scrapefile, :loader, :sites, :history
+
+   def initialize scrapefile, loader = Scrape::DefaultLoader.new
+     @scrapefile = File.expand_path scrapefile
+     @loader = loader
+     @sites = {}
+     @queue = []
+     @history = []
+   end
+
+   def run
+     load_scrapefile
+
+     while url = @queue.shift
+       Scrape.logger.info "Loading: #{url}..."
+       @history << url
+       if site = self[url]
+         if urls = site.parse(url)
+           enqueue *urls
+           Scrape.logger.info "Found #{urls.length} urls."
+         else
+           Scrape.logger.info "Done."
+         end
+       else
+         Scrape.logger.info "Not defined."
+       end
+     end
+   end
+
+   def reset
+     @history = []
+     @queue = sites.values.map{|site| site.url.to_s }
+   end
+
+   def queue
+     @queue.dup
+   end
+
+   def enqueue *urls
+     urls.flatten.each do |url|
+       @queue << url unless @history.include?(url) || @queue.include?(url)
+     end
+   end
+
+   def [] url
+     @sites.values.detect{|site| site.url < url }
+   end
+
+   def load_scrapefile
+     return if @scrapefile_loaded
+     result = loader.load(scrapefile)
+     @sites.update result if result.is_a? Hash
+     reset
+     @scrapefile_loaded = true
+   end
+ end
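
For orientation: a minimal sketch of driving Scrape::Application directly, much as the test suite does (the scrapefile path here is illustrative).

```ruby
require "scrape"

app = Scrape::Application.new("Scrapefile")  # hypothetical path
app.run  # loads the scrapefile, seeds the queue via #reset, then shifts
         # urls off the queue, recording each visited url in #history
```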
data/lib/scrape/cli.rb ADDED
@@ -0,0 +1,39 @@
+ require "optparse"
+
+ class Scrape::CLI
+   attr_reader :command, :pwd
+
+   def initialize command, pwd = Dir.pwd
+     @command, @pwd = command, pwd
+   end
+
+   def run argv
+     options = {:file => File.join(pwd, 'Scrapefile')}
+     opts = OptionParser.new do |opts|
+       opts.banner = "Scrape #{Scrape::VERSION} - Usage: #{command} [options]"
+       opts.separator ""
+       opts.separator "Specific options:"
+
+       opts.on "-f", "--scrapefile [FILE]", "Use FILE as scrapefile" do |file|
+         options[:file] = File.expand_path file
+       end
+       opts.on_tail "-h", "--help", "Show this message" do
+         puts opts
+         exit
+       end
+       opts.on_tail "-v", "--version", "Show version" do
+         puts Scrape::VERSION
+         exit
+       end
+     end
+     opts.parse argv
+
+     if File.exist? options[:file]
+       Scrape::Application.new(options[:file]).run
+     else
+       puts "#{command} aborted!"
+       puts "No Scrapefile found"
+       exit -1
+     end
+   end
+ end
data/lib/scrape/default_loader.rb ADDED
@@ -0,0 +1,19 @@
+ class Scrape::DefaultLoader
+   def load path
+     path = File.expand_path path
+     sites = {}
+
+     sandbox = Sandbox.new sites
+     sandbox.instance_eval File.read(path), path
+
+     sites
+   end
+
+   class Sandbox
+     include Scrape::DSL
+
+     def initialize sites
+       @sites = sites
+     end
+   end
+ end
data/lib/scrape/dsl.rb ADDED
@@ -0,0 +1,13 @@
+ module Scrape::DSL
+   def site *urls
+     @_sites ||= {}
+     @sites ||= {}
+     @current_sites = urls.flatten.map{|url| @_sites[url] ||= Scrape::Site.new(url) }
+   end
+
+   def match matcher, &proc
+     raise ArgumentError.new("site must be set") unless defined? @current_sites
+     matches = @current_sites.map{|site| @sites[site.url.to_s] = site; site.add_match matcher, &proc }
+     matches.size == 1 ? matches.first : matches
+   end
+ end
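
A Scrapefile is written in this DSL (bin/scrape ends up evaluating it inside DefaultLoader's Sandbox). A minimal sketch, with illustrative urls:

```ruby
site "http://example.com"    # registers the site and makes it current

match "/articles" do |doc|   # doc is the Nokogiri document of the fetched page
  puts doc.css("h1").map(&:text)
end
```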
data/lib/scrape/match.rb ADDED
@@ -0,0 +1,23 @@
+ class Scrape::Match
+   attr_reader :matcher
+
+   def initialize matcher, &proc
+     @matcher, @proc = matcher, proc
+     raise ArgumentError.new("Match block expects one argument") if proc.arity != 1
+   end
+
+   def invoke doc
+     @proc.call doc
+   end
+
+   def =~ url
+     case @matcher
+     when String
+       url.to_s.include? @matcher
+     when Regexp
+       url.to_s =~ @matcher
+     when Proc
+       @matcher.call url
+     end
+   end
+ end
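
Match#=~ accepts three matcher types, as the case statement above shows. An illustrative sketch (urls made up):

```ruby
m1 = Scrape::Match.new("bar")   {|doc|}  # String: substring test
m2 = Scrape::Match.new(/bar\z/) {|doc|}  # Regexp: pattern test
m3 = Scrape::Match.new(->(url){ url.include?("bar") }){|doc|}  # Proc: custom test

m1 =~ "http://example.com/bar"   # => true
m2 =~ "http://example.com/barn"  # => nil (falsy)
m3 =~ "http://example.com/bar"   # => true
```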
data/lib/scrape/site.rb ADDED
@@ -0,0 +1,31 @@
+ require 'nokogiri'
+
+ class Scrape::Site
+   attr_reader :url, :matches
+
+   def initialize url
+     @url = Scrape::URI.new url
+     @url.query = nil
+     @url.fragment = nil
+     @matches = []
+   end
+
+   def add_match matcher, &proc
+     match = Scrape::Match.new(matcher, &proc)
+     @matches << match
+     match
+   end
+
+   def parse url
+     url = self.url + url
+     doc = Nokogiri::HTML url.open
+
+     @matches.each{|match| match.invoke doc if match =~ url }
+
+     urls = doc.css("a[href]").map do |node|
+       href = self.url + node['href']
+       self.url < href ? href : nil
+     end
+     urls.compact
+   end
+ end
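
A sketch of Site in isolation (Site#parse performs a real HTTP GET, so the urls here are illustrative):

```ruby
site = Scrape::Site.new "http://example.com"
site.add_match(/news/) {|doc| puts doc.css("title").text }

# Fetches http://example.com/news, invokes the matching blocks, and
# returns only the discovered links that stay under the site's base url.
new_urls = site.parse("/news")
```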
data/lib/scrape/uri.rb ADDED
@@ -0,0 +1,58 @@
+ require 'uri'
+ require 'open-uri'
+
+ class Scrape::URI
+   def initialize uri = nil
+     @uri = case uri
+       when URI then uri.clone
+       when NilClass then URI.parse ""
+       else URI.parse uri.to_s
+     end
+   end
+
+   %w[fragment host hostname password path port query scheme user to_s relative? absolute?].each do |method_name|
+     class_eval <<-EOT, __FILE__, __LINE__ + 1
+       def #{method_name}
+         @uri.#{method_name}
+       end
+     EOT
+   end
+
+   %w[fragment host hostname password path port query scheme user].each do |method_name|
+     class_eval <<-EOT, __FILE__, __LINE__ + 1
+       def #{method_name}= value
+         @uri.#{method_name} = value
+       end
+     EOT
+   end
+
+   def + url
+     return clone if self == url
+     relative = (url.to_s =~ /^(?!.+:\/\/|\/)/)
+     uri = self.class.new @uri.merge(url)
+     uri.path = "#{@uri.path}#{uri.path}" if relative
+     uri
+   end
+
+   def < url
+     url[0, length] == to_s
+   end
+
+   def [] *args
+     to_s[*args]
+   end
+
+   def == url
+     to_s == url.to_s
+   end
+
+   def length
+     to_s.length
+   end
+   alias_method :size, :length
+
+   def open headers = {}, &block
+     headers = {"User-Agent" => Scrape.user_agent}.merge(headers)
+     super(to_s, headers, &block).read
+   end
+ end
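
The `+` and `<` operators carry the crawling semantics; the expected values below mirror test/unit/uri_test.rb:

```ruby
uri = Scrape::URI.new "http://www.example.com/foo"

(uri + "/bar").to_s  # => "http://www.example.com/bar"     (absolute path replaces)
(uri + "bar").to_s   # => "http://www.example.com/foo/bar" (relative path appends)

uri < "http://www.example.com/foo/bar"  # => true (string prefix check:
                                        #    is the given url under this one?)
```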
data/lib/scrape/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Scrape
+   VERSION = '0.1' unless defined? ::Scrape::VERSION
+ end
data/lib/scrape.rb ADDED
@@ -0,0 +1,36 @@
+ require "rubygems"
+ require "logger"
+ require "bundler/setup"
+
+ module Scrape
+   require 'scrape/version'
+
+   autoload 'Application', 'scrape/application'
+   autoload 'Site', 'scrape/site'
+   autoload 'Match', 'scrape/match'
+   autoload 'DefaultLoader', 'scrape/default_loader'
+   autoload 'DSL', 'scrape/dsl'
+   autoload 'URI', 'scrape/uri'
+
+   class ScrapeFileNotFound < Exception; end
+
+   class << self
+     attr_writer :user_agent
+
+     def user_agent
+       @user_agent || "Scrape/#{Scrape::VERSION}"
+     end
+
+     def logger
+       @logger ||= Logger.new STDOUT
+     end
+
+     def logger= log
+       @logger = log
+     end
+
+     def load_scrapefile path
+       Application.new path
+     end
+   end
+ end
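
Both module-level settings are optional; a sketch of configuring them before a run (the file names are illustrative):

```ruby
require "scrape"

Scrape.user_agent = "MyCrawler/1.0"          # default: "Scrape/#{Scrape::VERSION}"
Scrape.logger     = Logger.new("crawl.log")  # default: Logger.new(STDOUT)

Scrape.load_scrapefile("Scrapefile").run     # load_scrapefile only builds the
                                             # Application; #run starts the crawl
```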
data/scrape.gemspec ADDED
@@ -0,0 +1,22 @@
+ # -*- encoding: utf-8 -*-
+ require File.expand_path("../lib/scrape/version", __FILE__)
+
+ Gem::Specification.new do |s|
+   s.name        = "scrape"
+   s.version     = Scrape::VERSION
+   s.platform    = Gem::Platform::RUBY
+   s.authors     = ["Marty Zalega"]
+   s.email       = ["evilmarty@gmail.com"]
+   s.homepage    = "http://github.com/evilmarty/scrape"
+   s.summary     = %q{A really simple web scraper}
+   s.description = %q{An easy to use utility to scrape websites using a DSL similar to rake.}
+
+   s.rubyforge_project = "scrape"
+
+   s.files         = `git ls-files`.split("\n")
+   s.test_files    = `git ls-files -- {test}/*`.split("\n")
+   s.executables   = `git ls-files -- bin/*`.split("\n").map{|f| File.basename(f) }
+   s.require_paths = ["lib"]
+
+   s.add_development_dependency "nokogiri", "~> 1.5.5"
+ end
data/test/support/test1.scrape ADDED
@@ -0,0 +1,5 @@
+ site "http://example.com"
+
+ match "test" do |doc|
+   "boo!"
+ end
data/test/support/test2.scrape ADDED
@@ -0,0 +1 @@
+ site "http://example.com"
data/test/support/test3.scrape ADDED
@@ -0,0 +1,3 @@
+ match "test" do |doc|
+   "boo!"
+ end
data/test/test_helper.rb ADDED
@@ -0,0 +1,19 @@
+ $: << File.expand_path('../../lib', __FILE__)
+
+ require "minitest/autorun"
+ require "webmock/minitest"
+
+ require "bundler/setup"
+ Bundler.setup(:default, :test)
+
+ require "scrape"
+
+ class Scrape::TestCase < MiniTest::Unit::TestCase
+   class << self
+     def test name, &block
+       method_name = name.gsub /[^a-z0-9_]+/i, '_'
+       define_method "test_#{method_name}", &block
+     end
+     private :test
+   end
+ end
data/test/unit/application_test.rb ADDED
@@ -0,0 +1,72 @@
+ require "test_helper"
+
+ class ApplicationTest < Scrape::TestCase
+   SUPPORT_FILES = File.expand_path File.join(File.dirname(__FILE__), '..', 'support')
+
+   test "#load_scrapefile should parse the specified file" do
+     filepath = File.join(SUPPORT_FILES, 'test1.scrape')
+     app = Scrape::Application.new(filepath)
+     assert app.load_scrapefile, "scrape file failed to load"
+     assert_equal ["http://example.com"], app.sites.keys
+   end
+
+   test "#load_scrapefile should return nil when already loaded" do
+     filepath = File.join(SUPPORT_FILES, 'test1.scrape')
+     app = Scrape::Application.new(filepath)
+     assert app.load_scrapefile, "scrape file failed to load"
+     refute app.load_scrapefile, "scrape file should not have loaded again"
+   end
+
+   test "#[] should return the site that matches the given url" do
+     site1 = Scrape::Site.new "http://example.com"
+     site2 = Scrape::Site.new "http://example.org"
+     app = Scrape::Application.new(".")
+     app.sites.update site1.to_s => site1, site2.to_s => site2
+     assert_equal site1, app["http://example.com"]
+   end
+
+   test "#[] should return the site that is relative to the given url" do
+     site1 = Scrape::Site.new "http://example.com"
+     site2 = Scrape::Site.new "http://example.org"
+     app = Scrape::Application.new(".")
+     app.sites.update site1.to_s => site1, site2.to_s => site2
+     assert_equal site1, app["http://example.com/test"]
+   end
+
+   test "#[] should return nil when no site matches the given url" do
+     site1 = Scrape::Site.new "http://example.com"
+     site2 = Scrape::Site.new "http://example.org"
+     app = Scrape::Application.new(".")
+     app.sites.update site1.to_s => site1, site2.to_s => site2
+     assert_nil app["http://example.net"]
+   end
+
+   test "#reset should enqueue the sites that have been defined" do
+     site1 = Scrape::Site.new "http://example.com"
+     site2 = Scrape::Site.new "http://example.org"
+     app = Scrape::Application.new(".")
+     app.sites.update site1.to_s => site1, site2.to_s => site2
+     app.reset
+     assert_equal ["http://example.com", "http://example.org"], app.queue
+   end
+
+   test "#run should load the specified file" do
+     filepath = File.join(SUPPORT_FILES, 'test1.scrape')
+     test_loader = MiniTest::Mock.new
+     test_loader.expect :load, nil, [filepath]
+     Scrape::Application.new(filepath, test_loader).run
+     assert test_loader.verify, "loader did not receive file"
+   end
+
+   test "#enqueue should add the given url to the queue" do
+     app = Scrape::Application.new(".")
+     app.enqueue "http://example.com"
+     assert_equal ["http://example.com"], app.queue
+   end
+
+   test "#enqueue should not add the given url to the queue when it is already added" do
+     app = Scrape::Application.new(".")
+     3.times{ app.enqueue "http://example.com" }
+     assert_equal ["http://example.com"], app.queue
+   end
+ end
data/test/unit/default_loader_test.rb ADDED
@@ -0,0 +1,25 @@
+ require "test_helper"
+
+ class DefaultLoaderTest < Scrape::TestCase
+   SUPPORT_FILES = File.expand_path File.join(File.dirname(__FILE__), '..', 'support')
+
+   test "#load should return sites parsed from the specified file" do
+     loader = Scrape::DefaultLoader.new
+     sites = loader.load File.join(SUPPORT_FILES, "test1.scrape")
+     assert_equal ["http://example.com"], sites.keys
+     assert_instance_of Scrape::Site, sites.values[0]
+   end
+
+   test "#load should return an empty hash when no matches have been defined" do
+     loader = Scrape::DefaultLoader.new
+     sites = loader.load File.join(SUPPORT_FILES, "test2.scrape")
+     assert_equal Hash.new, sites
+   end
+
+   test "#load should raise an error when no site is defined" do
+     loader = Scrape::DefaultLoader.new
+     assert_raises ArgumentError do
+       loader.load File.join(SUPPORT_FILES, "test3.scrape")
+     end
+   end
+ end
data/test/unit/match_test.rb ADDED
@@ -0,0 +1,54 @@
+ require "test_helper"
+
+ class MatchTest < Scrape::TestCase
+   test "#initialize should raise error when proc's arity isn't one" do
+     assert_raises ArgumentError do
+       Scrape::Match.new("test"){ "no arguments" }
+     end
+   end
+
+   test "#invoke should call the proc" do
+     ok = false
+     match = Scrape::Match.new("test"){|doc| ok = true }
+     match.invoke nil
+     assert ok, "Proc was not called"
+   end
+
+   test "#invoke should pass the document to the proc" do
+     doc = "yay"
+     ok = false
+     match = Scrape::Match.new("test"){|d| ok = (doc == d) }
+     match.invoke doc
+     assert ok, "Document was not passed into the proc"
+   end
+
+   test "#=~ should return true when the url contains the string" do
+     match = Scrape::Match.new("bar"){|doc|}
+     assert match =~ "foobar", "Expected true"
+   end
+
+   test "#=~ should return false when the url doesn't contain the string" do
+     match = Scrape::Match.new("bar"){|doc|}
+     refute match =~ "ponies", "Expected false"
+   end
+
+   test "#=~ should return true when the url matches the regexp" do
+     match = Scrape::Match.new(/bar/){|doc|}
+     assert match =~ "foobar", "Expected true"
+   end
+
+   test "#=~ should return false when the url doesn't match the regexp" do
+     match = Scrape::Match.new(/bar/){|doc|}
+     refute match =~ "ponies", "Expected false"
+   end
+
+   test "#=~ should return true when the proc is truthy" do
+     match = Scrape::Match.new(lambda{|url| true }){|doc|}
+     assert match =~ "ponies", "Expected true"
+   end
+
+   test "#=~ should return false when the proc is falsy" do
+     match = Scrape::Match.new(lambda{|url| false }){|doc|}
+     refute match =~ "ponies", "Expected false"
+   end
+ end
data/test/unit/site_test.rb ADDED
@@ -0,0 +1,75 @@
+ require "test_helper"
+
+ class SiteTest < Scrape::TestCase
+   test "#add_match should create a Match object and add it to the collection" do
+     site = Scrape::Site.new "http://www.example.com"
+     match = site.add_match("/test") { |doc| }
+     assert_instance_of Scrape::Match, match
+   end
+
+   test "#parse should return absolute urls that match the site's url" do
+     stub_request(:get, "http://www.example.com/test").
+       with(:headers => {"User-Agent" => Scrape.user_agent}).
+       to_return(:status => 200, :body => <<-HTML)
+         <html>
+           <body>
+             <a href="http://www.example.com/link1.html">link 1</a>
+             <a href="http://example.com/link2.html">link 2</a>
+             <a href="http://example.org/link3.html">link 3</a>
+           </body>
+         </html>
+       HTML
+
+     site = Scrape::Site.new "http://www.example.com"
+     assert_equal [Scrape::URI.new("http://www.example.com/link1.html")], site.parse("/test")
+   end
+
+   test "#parse should return relative urls to the site" do
+     stub_request(:get, "http://www.example.com/test").
+       with(:headers => {"User-Agent" => Scrape.user_agent}).
+       to_return(:status => 200, :body => <<-HTML)
+         <html>
+           <body>
+             <a href="link1.html">link 1</a>
+           </body>
+         </html>
+       HTML
+
+     site = Scrape::Site.new "http://www.example.com"
+     assert_equal [Scrape::URI.new("http://www.example.com/link1.html")], site.parse("/test")
+   end
+
+   test "#parse should return no urls" do
+     stub_request(:get, "http://www.example.com/test").
+       with(:headers => {"User-Agent" => Scrape.user_agent}).
+       to_return(:status => 200, :body => <<-HTML)
+         <html>
+           <body>
+             <a href="/link1.html">link 1</a>
+           </body>
+         </html>
+       HTML
+
+     site = Scrape::Site.new "http://www.example.com/test"
+     assert_equal [], site.parse("/test")
+   end
+
+   test "#parse should invoke the Match when hit" do
+     stub_request(:get, "http://www.example.com/test").
+       with(:headers => {"User-Agent" => Scrape.user_agent}).
+       to_return(:status => 200, :body => <<-HTML)
+         <html>
+           <body>
+             <a href="link1.html">link 1</a>
+           </body>
+         </html>
+       HTML
+
+     ok = false
+     site = Scrape::Site.new "http://www.example.com"
+     site.add_match(/test/){|doc| ok = true }
+     site.parse "/test"
+
+     assert ok, "Match was not invoked"
+   end
+ end
data/test/unit/uri_test.rb ADDED
@@ -0,0 +1,52 @@
+ require "test_helper"
+
+ class URITest < Scrape::TestCase
+   {fragment: "blah", host: "www.example.com", password: "secret", path: "/dot", query: "foo=bar", scheme: "http", user: "chuck", relative?: false, absolute?: true}.each do |method_name, value|
+     test "##{method_name} should return value" do
+       uri = Scrape::URI.new "http://chuck:secret@www.example.com/dot?foo=bar#blah"
+       assert_equal value, uri.send(method_name)
+     end
+   end
+
+   test "#open should return the contents at the url" do
+     stub_request(:get, "http://www.example.com/").with(headers: {"User-Agent" => Scrape.user_agent}).to_return(status: 200, body: "Howdie")
+
+     uri = Scrape::URI.new "http://www.example.com"
+     assert_equal "Howdie", uri.open
+   end
+
+   test "#+ should return a URI with the specified path" do
+     uri1 = Scrape::URI.new "http://www.example.com"
+     uri2 = uri1 + "/bar"
+     assert_equal "http://www.example.com/bar", uri2.to_s
+   end
+
+   test "#+ should return a URI overwriting with the specified path" do
+     uri1 = Scrape::URI.new "http://www.example.com/foo"
+     uri2 = uri1 + "/bar"
+     assert_equal "http://www.example.com/bar", uri2.to_s
+   end
+
+   test "#+ should return a URI with the specified path appended" do
+     uri1 = Scrape::URI.new "http://www.example.com/foo"
+     uri2 = uri1 + "bar"
+     assert_equal "http://www.example.com/foo/bar", uri2.to_s
+   end
+
+   test "#+ should return a URI from the absolute url" do
+     uri1 = Scrape::URI.new "http://www.example.com/foo"
+     uri2 = uri1 + "http://www.example.com/bar"
+     assert_equal "http://www.example.com/bar", uri2.to_s
+   end
+
+   test "#+ should return a URI appended from the absolute url" do
+     uri1 = Scrape::URI.new "http://www.example.com/foo"
+     uri2 = uri1 + "http://www.example.com/foo/bar"
+     assert_equal "http://www.example.com/foo/bar", uri2.to_s
+   end
+
+   test "#< should return true when the specified url is greater" do
+     uri1 = Scrape::URI.new "http://www.example.com/foo"
+     assert uri1 < "http://www.example.com/foo/bar"
+   end
+ end
metadata ADDED
@@ -0,0 +1,90 @@
+ --- !ruby/object:Gem::Specification
+ name: scrape
+ version: !ruby/object:Gem::Version
+   version: '0.1'
+ prerelease:
+ platform: ruby
+ authors:
+ - Marty Zalega
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2012-07-10 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.5.5
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.5.5
+ description: An easy to use utility to scrape websites using a DSL similar to rake.
+ email:
+ - evilmarty@gmail.com
+ executables:
+ - scrape
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - Gemfile
+ - Gemfile.lock
+ - LICENSE
+ - README.md
+ - Rakefile
+ - bin/scrape
+ - examples/github.scrape
+ - examples/google.scrape
+ - lib/scrape.rb
+ - lib/scrape/application.rb
+ - lib/scrape/cli.rb
+ - lib/scrape/default_loader.rb
+ - lib/scrape/dsl.rb
+ - lib/scrape/match.rb
+ - lib/scrape/site.rb
+ - lib/scrape/uri.rb
+ - lib/scrape/version.rb
+ - scrape.gemspec
+ - test/support/test1.scrape
+ - test/support/test2.scrape
+ - test/support/test3.scrape
+ - test/test_helper.rb
+ - test/unit/application_test.rb
+ - test/unit/default_loader_test.rb
+ - test/unit/match_test.rb
+ - test/unit/site_test.rb
+ - test/unit/uri_test.rb
+ homepage: http://github.com/evilmarty/scrape
+ licenses: []
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project: scrape
+ rubygems_version: 1.8.23
+ signing_key:
+ specification_version: 3
+ summary: A really simple web scraper
+ test_files: []