webscour 0.1 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +18 -0
- data/lib/webscour.rb +30 -0
- data/lib/webscour/version.rb +3 -0
- data/spec/spec_helper.rb +5 -0
- data/spec/webscour_spec.rb +53 -0
- metadata +10 -5
data/README.md
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
WebScour
|
2
|
+
=======
|
3
|
+
|
4
|
+
*WebScour* is a small, DSL-based web scraping tool.
|
5
|
+
|
6
|
+
require 'webscour'
|
7
|
+
|
8
|
+
blog = WebScour.parse('http://example.com') do
|
9
|
+
element :title
|
10
|
+
end
|
11
|
+
|
12
|
+
blog.title
|
13
|
+
#=> "Hello World"
|
14
|
+
|
15
|
+
|
16
|
+
[See Nibbler][nibbler] for more `element` examples.
|
17
|
+
|
18
|
+
[nibbler]: http://github.com/mislav/nibbler
|
data/lib/webscour.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'nibbler'
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'faraday'
|
5
|
+
|
6
|
+
# WebScour downloads a page with Faraday and scrapes it through an anonymous
# Nibbler subclass that is configured by the block given to the constructor.
#
#   page = WebScour.parse('http://example.com') { element :title }
#   page.title
class WebScour
  # Raised when a chain of redirects is longer than MAX_REDIRECTS hops.
  class RedirectLimitError < StandardError; end

  # Response statuses whose Location header is followed.
  REDIRECT_STATUSES = [301, 302, 303, 307, 308].freeze

  # Maximum number of redirects followed for a single request.
  MAX_REDIRECTS = 10

  # uri      - the URI the document was finally fetched from
  # document - the response object returned by Faraday
  # klass    - the object returned by the Nibbler subclass' .parse
  attr_accessor :uri, :document, :klass

  # Convenience constructor; identical to WebScour.new.
  def self.parse(uri, &block)
    new(uri, &block)
  end

  # Fetches +uri+ (following redirects) and parses the response body with a
  # fresh Nibbler subclass. The block is instance_eval'ed on that subclass,
  # so Nibbler's class-level DSL is available inside it.
  #
  # Raises ArgumentError when no block is given, before any request is made.
  def initialize(uri, &block)
    raise ArgumentError, 'WebScour requires a block describing the elements to scrape' unless block

    @document, @uri = fetch uri
    scraper = Class.new(Nibbler)
    scraper.instance_eval(&block)
    @klass = scraper.parse(@document.body)
  end

  # Forwards unknown messages (with their arguments and block) to the parsed
  # object. Only methods the parsed object publicly responds to are
  # forwarded; anything else falls through to the regular NoMethodError.
  def method_missing(name, *args, &block)
    if @klass.respond_to?(name)
      @klass.send(name, *args, &block)
    else
      super
    end
  end

  # Keeps respond_to? in agreement with method_missing.
  def respond_to_missing?(name, include_private = false)
    @klass.respond_to?(name, include_private) || super
  end

  private

  # Performs a GET request for +uri+ and follows redirects.
  #
  # Returns a two-element array: the final response and the URI it was
  # fetched from. A redirect response without a Location header is returned
  # as-is, because there is nothing to follow.
  #
  # Raises RedirectLimitError once +redirects_left+ hops have been used up,
  # which also stops redirect loops.
  def fetch(uri, redirects_left = MAX_REDIRECTS)
    response = Faraday.get uri
    return [response, uri] unless REDIRECT_STATUSES.include?(response.status)

    location = response.headers['location']
    return [response, uri] if location.nil? || location.empty?

    if redirects_left <= 0
      raise RedirectLimitError, "too many redirects (limit #{MAX_REDIRECTS}) while fetching #{uri}"
    end

    # Return the nested result unchanged so the caller sees the *final* URI,
    # not just the first Location in the chain.
    fetch(location, redirects_left - 1)
  end
end
|
data/spec/webscour_spec.rb
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/spec_helper"
|
2
|
+
|
3
|
+
# Behavioural spec for WebScour: every example runs against a canned HTML
# page registered with FakeWeb.
describe "WebScour" do
  # Build the fixture document and register it before each example.
  before do
    @data = <<-HTML
<html>
<head>
<title>Hello World</title>
</head>
<body>
<p>This is testing.</p>

<ul>
<li>A</li>
<li>B</li>
<li>C</li>
</ul>
</body>
</html>
    HTML
    FakeWeb.register_uri(:any, "http://www.example.com", :body => @data)
  end

  # Sanity check on the fixture itself.
  it "should have data" do
    @data.should_not be_nil
    @data.should_not be_empty
  end

  # The registered page must be what Faraday receives.
  it "should have content" do
    response = Faraday.get "http://www.example.com/"
    fetched = response.body
    fetched.should_not be_empty
    fetched.should == @data
  end

  # Scrapes the title, the first paragraph and the list out of the page.
  it "should have title" do
    page = WebScour.parse("http://www.example.com/") do
      element :title
      element './/body/p' => :body
      elements :ul => :list do
        element :li => :text
      end
    end

    page.title.should == "Hello World"
    page.body.should =~ /This is testing\./

    items = page.list
    items.should be_kind_of(Array)
    page.list.first.text.should == "A"
  end
end
|
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webscour
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 25
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
|
9
|
+
- 1
|
10
|
+
version: 0.1.1
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- Bryan Goines
|
@@ -14,7 +15,7 @@ autorequire:
|
|
14
15
|
bindir: bin
|
15
16
|
cert_chain: []
|
16
17
|
|
17
|
-
date: 2010-11-
|
18
|
+
date: 2010-11-06 00:00:00 -05:00
|
18
19
|
default_executable:
|
19
20
|
dependencies:
|
20
21
|
- !ruby/object:Gem::Dependency
|
@@ -123,8 +124,12 @@ extensions: []
|
|
123
124
|
|
124
125
|
extra_rdoc_files: []
|
125
126
|
|
126
|
-
files:
|
127
|
-
|
127
|
+
files:
|
128
|
+
- README.md
|
129
|
+
- lib/webscour.rb
|
130
|
+
- lib/webscour/version.rb
|
131
|
+
- spec/spec_helper.rb
|
132
|
+
- spec/webscour_spec.rb
|
128
133
|
has_rdoc: true
|
129
134
|
homepage: http://rubygems.org/gems/webscour
|
130
135
|
licenses: []
|