skrape 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +10 -4
- data/lib/skrape.rb +8 -1
- data/lib/skrape/version.rb +1 -1
- data/spec/skrape_spec.rb +11 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d0e1a84190cf280563a99b9c5a8e6ae0e3bef7e8
|
4
|
+
data.tar.gz: 7c1b968d7feedd6cb146cddac6e4e07c9b0e3b3a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f833b7fe7873798b691fb97ef18013b60535fd45074a056494d0267e25fa348f09ac9fd94ee46c31fd81e2978a97679f1d865a2cfdf2c227f9146a67d2174a4e
|
7
|
+
data.tar.gz: 473ae1b801b5b8a5b7857d9add162422d7c0702b5d4260dc0103d15360aad98dae68bd62044377668e151de46ad2517b9353fea50d7670d405b5c4e78e77126f
|
data/README.md
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# Skrape
|
2
2
|
|
3
|
-
|
3
|
+
Skrape provides a cute DSL for extracting information from pages on the
|
4
|
+
web. You give it a url and a block and it gives you back a hash.
|
4
5
|
|
5
6
|
## Installation
|
6
7
|
|
@@ -18,9 +19,6 @@ Or install it yourself as:
|
|
18
19
|
|
19
20
|
## Usage
|
20
21
|
|
21
|
-
Skrape provides a cute DSL for extracting information from pages on the
|
22
|
-
web. You give it a url and a block and it gives you back a hash.
|
23
|
-
|
24
22
|
Let's say you have a page like this:
|
25
23
|
|
26
24
|
<html><body><h1>I am a title</h1></body></html>
|
@@ -47,6 +45,14 @@ The element(s) will be passed into the block as a
|
|
47
45
|
Nokogiri::XML::NodeSet for you to play with. Whatever text you return
|
48
46
|
will be added to the hash of things to return.
|
49
47
|
|
48
|
+
For those moments when you want an error raised when a selector returns
|
49
|
+
nothing, you can add:
|
50
|
+
|
51
|
+
results = Skrape::Page.new(url).extract do
|
52
|
+
error_when_selector_returns_nothing true
|
53
|
+
extract_link_href with: 'a', and_run: proc {|link| link.attr('href').value }
|
54
|
+
end
|
55
|
+
|
50
56
|
|
51
57
|
## Contributing
|
52
58
|
|
data/lib/skrape.rb
CHANGED
@@ -10,6 +10,7 @@ module Skrape
|
|
10
10
|
class Page
|
11
11
|
|
12
12
|
def initialize url
|
13
|
+
@fail_loudly = false
|
13
14
|
@extracted_info = {}
|
14
15
|
agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/32.0.1700.102 Chrome/32.0.1700.102 Safari/537.36"
|
15
16
|
@document = Nokogiri::HTML(open(url, "User-Agent" => agent))
|
@@ -21,10 +22,16 @@ module Skrape
|
|
21
22
|
@extracted_info
|
22
23
|
end
|
23
24
|
|
25
|
+
def error_when_selector_returns_nothing value
|
26
|
+
@fail_loudly = value
|
27
|
+
end
|
28
|
+
|
24
29
|
def method_missing name, args
|
25
30
|
feature_name = name.to_s.gsub('extract_', '').to_sym
|
26
31
|
element = @document.css args[:with]
|
27
|
-
|
32
|
+
if @fail_loudly
|
33
|
+
raise NoElementsFoundError, "the css selector for '#{feature_name}' did not return anything" if element.empty?
|
34
|
+
end
|
28
35
|
if args[:and_run]
|
29
36
|
@extracted_info[feature_name] = args[:and_run].call(element)
|
30
37
|
else
|
data/lib/skrape/version.rb
CHANGED
data/spec/skrape_spec.rb
CHANGED
@@ -25,12 +25,21 @@ describe Skrape do
|
|
25
25
|
expect(results[:link_href]).to eq "http://www.iana.org/domains/example"
|
26
26
|
end
|
27
27
|
|
28
|
-
it "raises
|
28
|
+
it "does not raises an error when a selector returns nothing" do
|
29
29
|
expect{
|
30
30
|
Skrape::Page.new(url).extract do
|
31
31
|
extract_nothing with: 'foo'
|
32
32
|
end
|
33
|
-
}.
|
33
|
+
}.not_to raise_error
|
34
|
+
end
|
35
|
+
|
36
|
+
it "raises a error when when told to" do
|
37
|
+
expect{
|
38
|
+
Skrape::Page.new(url).extract do
|
39
|
+
error_when_selector_returns_nothing true
|
40
|
+
extract_nothing with: 'foo'
|
41
|
+
end
|
42
|
+
}.to raise_error
|
34
43
|
end
|
35
44
|
|
36
45
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: skrape
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mike Williamson
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2014-
|
12
|
+
date: 2014-03-07 00:00:00 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|