horsefield 0.2.4 → 0.2.5

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2cfb5049fcbda71817b68f0ba6359a6a2794afdc
4
- data.tar.gz: 2d3c26b593f8a05e6510522f805c3185815a59fc
3
+ metadata.gz: 8b8f8255739340596e630908fdf232ad5cf3ac97
4
+ data.tar.gz: 7f5e63af192760467df38b2788efadee28926da4
5
5
  SHA512:
6
- metadata.gz: d3edbb36a1e8fd05e3306e14f0f7ca54f9ab9d715afcb64c4440507a0808e721f5790343fb23d16f61d343f183cc37da06ea0bc62f04261fa7ee6bf680e76ffd
7
- data.tar.gz: e60f5aef998f9228cc3ddc493d3206c408b32fa3b7221199d6e2ad80bb0658bcb7145c09cbf0615a32f39e948548e2861f2efed95e7fc2e4da7f1f2633d9c84b
6
+ metadata.gz: b493efb36209a2b52527ffb5291314f4b3eeb1a2e80588f5f20a5bd627aa91456acf51d7d4c088a1ef67806467ff4c86099e6ec8f489c9571df605bea60f47e9
7
+ data.tar.gz: bed0f9129ef6e737577644709d6d8b2a8e33db76a1b5ccb6e3f0adbbd69d5be3f76f0a4cd78675c8d6d5ab195d72afbe98b91bd6ba86b5b6275db723d35c0197
@@ -8,7 +8,11 @@ module Horsefield
8
8
  def process
9
9
  case @html
10
10
  when Nokogiri::XML::Element then
11
- @html.search('text()').to_s.split.join ' '
11
+ if @type == :text
12
+ @html.search('text()').to_s.split.join ' '
13
+ elsif @type == :html
14
+ @html.to_s
15
+ end
12
16
  when Nokogiri::XML::Attr then
13
17
  @html.value
14
18
  end
@@ -16,15 +16,15 @@ module Horsefield
16
16
  def browse(*)
17
17
  end
18
18
 
19
- def one(name, selector, &block)
20
- @nodes[name] = dig_deeper selector, &block
19
+ def one(name, selector, type = :text, &block)
20
+ @nodes[name] = dig_deeper selector, false, type, &block
21
21
  end
22
22
 
23
23
  def many(name, selector, &block)
24
24
  @nodes[name] = dig_deeper selector, true, &block
25
25
  end
26
26
 
27
- def dig_deeper(selector, many = false, &block)
27
+ def dig_deeper(selector, many = false, type = :text, &block)
28
28
  return nil if base_elements(selector).empty?
29
29
 
30
30
  if block
@@ -38,10 +38,10 @@ module Horsefield
38
38
  else
39
39
  if many
40
40
  base_elements(selector).map do |e|
41
- Horsefield::Node.new(e).process(&block)
41
+ Horsefield::Node.new(e, type).process(&block)
42
42
  end
43
43
  else
44
- Horsefield::Node.new(base_elements(selector).first).process(&block)
44
+ Horsefield::Node.new(base_elements(selector).first, type).process(&block)
45
45
  end
46
46
  end
47
47
  end
@@ -1,3 +1,3 @@
1
1
  module Horsefield
2
- VERSION = "0.2.4"
2
+ VERSION = "0.2.5"
3
3
  end
data/spec/scraper_spec.rb CHANGED
@@ -31,6 +31,18 @@ describe Horsefield::Scraper do
31
31
  result[:job][:missing].should be_nil
32
32
  end
33
33
 
34
+ it 'can return HTML instead of text' do
35
+ result = Horsefield::Scraper.new.scrape html: @html do
36
+ one :job, '.listingsTable .odd, .listingsTable .even' do
37
+ one :title, '.jobTitleContainer', :html
38
+ one :company, '.companyContainer'
39
+ one :missing, '.doesNotExist'
40
+ end
41
+ end
42
+
43
+ result[:job][:title].should match(/<div class=\"jobTitleContainer\">/)
44
+ end
45
+
34
46
  it 'works with Watir' do
35
47
  browser = Watir::Browser.new :phantomjs
36
48
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: horsefield
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Erik Strömberg
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-09-03 00:00:00.000000000 Z
11
+ date: 2013-10-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -166,7 +166,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
166
166
  version: '0'
167
167
  requirements: []
168
168
  rubyforge_project:
169
- rubygems_version: 2.0.0
169
+ rubygems_version: 2.1.9
170
170
  signing_key:
171
171
  specification_version: 4
172
172
  summary: It's a scraper