webscour 0.1 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +18 -0
- data/lib/webscour.rb +30 -0
- data/lib/webscour/version.rb +3 -0
- data/spec/spec_helper.rb +5 -0
- data/spec/webscour_spec.rb +53 -0
- metadata +10 -5
data/README.md
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
WebScour
|
2
|
+
=======
|
3
|
+
|
4
|
+
*WebScour* is a little DSL web scraping tool.
|
5
|
+
|
6
|
+
require 'webscour'
|
7
|
+
|
8
|
+
blog = WebScour.parse('http://example.com') do
|
9
|
+
element :title
|
10
|
+
end
|
11
|
+
|
12
|
+
blog.title
|
13
|
+
#=> "Hello World"
|
14
|
+
|
15
|
+
|
16
|
+
[See Nibbler][nibbler] for more `element` examples.
|
17
|
+
|
18
|
+
[nibbler]: http://github.com/mislav/nibbler
|
data/lib/webscour.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'nibbler'
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'faraday'
|
5
|
+
|
6
|
+
class WebScour
|
7
|
+
attr_accessor :uri, :document, :klass
|
8
|
+
def self.parse(uri, &block)
|
9
|
+
new(uri, &block)
|
10
|
+
end
|
11
|
+
|
12
|
+
def initialize(uri, &block)
|
13
|
+
@document, @uri = fetch uri
|
14
|
+
@klass = Class.new(Nibbler) { instance_eval(&block) }.parse(@document.body)
|
15
|
+
end
|
16
|
+
|
17
|
+
def method_missing(m)
|
18
|
+
@klass.send(:"#{m}")
|
19
|
+
end
|
20
|
+
|
21
|
+
private
|
22
|
+
def fetch(uri)
|
23
|
+
response = Faraday.get uri
|
24
|
+
if [301,302].include?(response.status)
|
25
|
+
uri = response.headers['location']
|
26
|
+
response = fetch(uri).first
|
27
|
+
end
|
28
|
+
[response, uri]
|
29
|
+
end
|
30
|
+
end
|
data/spec/webscour_spec.rb
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/spec_helper"
|
2
|
+
|
3
|
+
describe "WebScour" do
|
4
|
+
before do
|
5
|
+
@data = <<-HTML
|
6
|
+
<html>
|
7
|
+
<head>
|
8
|
+
<title>Hello World</title>
|
9
|
+
</head>
|
10
|
+
<body>
|
11
|
+
<p>This is testing.</p>
|
12
|
+
|
13
|
+
<ul>
|
14
|
+
<li>A</li>
|
15
|
+
<li>B</li>
|
16
|
+
<li>C</li>
|
17
|
+
</ul>
|
18
|
+
</body>
|
19
|
+
</html>
|
20
|
+
HTML
|
21
|
+
FakeWeb.register_uri(:any, "http://www.example.com", :body => @data)
|
22
|
+
end
|
23
|
+
|
24
|
+
it "should have data" do
|
25
|
+
@data.should_not be_nil
|
26
|
+
@data.should_not be_empty
|
27
|
+
end
|
28
|
+
|
29
|
+
it "should have content" do
|
30
|
+
resp = Faraday.get "http://www.example.com/"
|
31
|
+
resp.body.should_not be_empty
|
32
|
+
resp.body.should == @data
|
33
|
+
end
|
34
|
+
|
35
|
+
it "should have title" do
|
36
|
+
html = WebScour.parse("http://www.example.com/") do
|
37
|
+
|
38
|
+
element :title
|
39
|
+
|
40
|
+
element './/body/p' => :body
|
41
|
+
|
42
|
+
elements :ul => :list do
|
43
|
+
element:li => :text
|
44
|
+
end
|
45
|
+
|
46
|
+
end
|
47
|
+
|
48
|
+
html.title.should == "Hello World"
|
49
|
+
html.body.should =~ /This is testing\./
|
50
|
+
html.list.should be_kind_of(Array)
|
51
|
+
html.list.first.text.should == "A"
|
52
|
+
end
|
53
|
+
end
|
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webscour
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 25
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
|
9
|
+
- 1
|
10
|
+
version: 0.1.1
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- Bryan Goines
|
@@ -14,7 +15,7 @@ autorequire:
|
|
14
15
|
bindir: bin
|
15
16
|
cert_chain: []
|
16
17
|
|
17
|
-
date: 2010-11-
|
18
|
+
date: 2010-11-06 00:00:00 -05:00
|
18
19
|
default_executable:
|
19
20
|
dependencies:
|
20
21
|
- !ruby/object:Gem::Dependency
|
@@ -123,8 +124,12 @@ extensions: []
|
|
123
124
|
|
124
125
|
extra_rdoc_files: []
|
125
126
|
|
126
|
-
files:
|
127
|
-
|
127
|
+
files:
|
128
|
+
- README.md
|
129
|
+
- lib/webscour.rb
|
130
|
+
- lib/webscour/version.rb
|
131
|
+
- spec/spec_helper.rb
|
132
|
+
- spec/webscour_spec.rb
|
128
133
|
has_rdoc: true
|
129
134
|
homepage: http://rubygems.org/gems/webscour
|
130
135
|
licenses: []
|