horsefield 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/horsefield/node.rb +5 -1
- data/lib/horsefield/node_set.rb +5 -5
- data/lib/horsefield/version.rb +1 -1
- data/spec/scraper_spec.rb +12 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8b8f8255739340596e630908fdf232ad5cf3ac97
|
4
|
+
data.tar.gz: 7f5e63af192760467df38b2788efadee28926da4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b493efb36209a2b52527ffb5291314f4b3eeb1a2e80588f5f20a5bd627aa91456acf51d7d4c088a1ef67806467ff4c86099e6ec8f489c9571df605bea60f47e9
|
7
|
+
data.tar.gz: bed0f9129ef6e737577644709d6d8b2a8e33db76a1b5ccb6e3f0adbbd69d5be3f76f0a4cd78675c8d6d5ab195d72afbe98b91bd6ba86b5b6275db723d35c0197
|
data/lib/horsefield/node.rb
CHANGED
@@ -8,7 +8,11 @@ module Horsefield
|
|
8
8
|
def process
|
9
9
|
case @html
|
10
10
|
when Nokogiri::XML::Element then
|
11
|
-
@
|
11
|
+
if @type == :text
|
12
|
+
@html.search('text()').to_s.split.join ' '
|
13
|
+
elsif @type == :html
|
14
|
+
@html.to_s
|
15
|
+
end
|
12
16
|
when Nokogiri::XML::Attr then
|
13
17
|
@html.value
|
14
18
|
end
|
data/lib/horsefield/node_set.rb
CHANGED
@@ -16,15 +16,15 @@ module Horsefield
|
|
16
16
|
def browse(*)
|
17
17
|
end
|
18
18
|
|
19
|
-
def one(name, selector, &block)
|
20
|
-
@nodes[name] = dig_deeper selector, &block
|
19
|
+
def one(name, selector, type = :text, &block)
|
20
|
+
@nodes[name] = dig_deeper selector, false, type, &block
|
21
21
|
end
|
22
22
|
|
23
23
|
def many(name, selector, &block)
|
24
24
|
@nodes[name] = dig_deeper selector, true, &block
|
25
25
|
end
|
26
26
|
|
27
|
-
def dig_deeper(selector, many = false, &block)
|
27
|
+
def dig_deeper(selector, many = false, type = :text, &block)
|
28
28
|
return nil if base_elements(selector).empty?
|
29
29
|
|
30
30
|
if block
|
@@ -38,10 +38,10 @@ module Horsefield
|
|
38
38
|
else
|
39
39
|
if many
|
40
40
|
base_elements(selector).map do |e|
|
41
|
-
Horsefield::Node.new(e).process(&block)
|
41
|
+
Horsefield::Node.new(e, type).process(&block)
|
42
42
|
end
|
43
43
|
else
|
44
|
-
Horsefield::Node.new(base_elements(selector).first).process(&block)
|
44
|
+
Horsefield::Node.new(base_elements(selector).first, type).process(&block)
|
45
45
|
end
|
46
46
|
end
|
47
47
|
end
|
data/lib/horsefield/version.rb
CHANGED
data/spec/scraper_spec.rb
CHANGED
@@ -31,6 +31,18 @@ describe Horsefield::Scraper do
|
|
31
31
|
result[:job][:missing].should be_nil
|
32
32
|
end
|
33
33
|
|
34
|
+
it 'can return HTML instead of text' do
|
35
|
+
result = Horsefield::Scraper.new.scrape html: @html do
|
36
|
+
one :job, '.listingsTable .odd, .listingsTable .even' do
|
37
|
+
one :title, '.jobTitleContainer', :html
|
38
|
+
one :company, '.companyContainer'
|
39
|
+
one :missing, '.doesNotExist'
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
result[:job][:title].should match(/<div class=\"jobTitleContainer\">/)
|
44
|
+
end
|
45
|
+
|
34
46
|
it 'works with Watir' do
|
35
47
|
browser = Watir::Browser.new :phantomjs
|
36
48
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: horsefield
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.4
|
4
|
+
version: 0.2.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erik Strömberg
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-10-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
@@ -166,7 +166,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
166
166
|
version: '0'
|
167
167
|
requirements: []
|
168
168
|
rubyforge_project:
|
169
|
-
rubygems_version: 2.
|
169
|
+
rubygems_version: 2.1.9
|
170
170
|
signing_key:
|
171
171
|
specification_version: 4
|
172
172
|
summary: It's a scraper
|