horsefield 0.3.14 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +5 -0
- data/Guardfile +11 -0
- data/LICENSE.txt +1 -1
- data/README.md +7 -23
- data/Rakefile +5 -4
- data/horsefield.gemspec +6 -10
- data/lib/horsefield.rb +5 -4
- data/lib/horsefield/diggable.rb +47 -0
- data/lib/horsefield/nokogiri.rb +11 -0
- data/lib/horsefield/scraper.rb +41 -32
- data/lib/horsefield/version.rb +1 -1
- data/test/horsefield/test_scraper.rb +56 -0
- data/test/recipe_source.html +2322 -0
- data/test/test_helper.rb +3 -0
- metadata +22 -66
- data/.rspec +0 -2
- data/lib/horsefield/node.rb +0 -21
- data/lib/horsefield/node_set.rb +0 -59
- data/spec/fixtures/monster.html +0 -2311
- data/spec/fixtures/vcr_cassettes/facebook/johnny_qiu1.yml +0 -7105
- data/spec/scraper_spec.rb +0 -70
- data/spec/spec_helper.rb +0 -29
data/spec/scraper_spec.rb
DELETED
@@ -1,70 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
require 'horsefield/scraper'
|
3
|
-
require 'watir-webdriver'
|
4
|
-
|
5
|
-
describe Horsefield::Scraper do
|
6
|
-
describe 'with HTML' do
|
7
|
-
before do
|
8
|
-
@html = IO.read File.join(__dir__, 'fixtures/monster.html')
|
9
|
-
end
|
10
|
-
|
11
|
-
it 'should scrape' do
|
12
|
-
result = Horsefield::Scraper.new.scrape html: @html do
|
13
|
-
many :jobs, '.listingsTable .odd, .listingsTable .even' do
|
14
|
-
one :title, '.jobTitleContainer'
|
15
|
-
one :company, '.companyContainer'
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
|
-
expect(result[:jobs].length).to eq(9)
|
20
|
-
end
|
21
|
-
|
22
|
-
it 'should be able to scrape in scope' do
|
23
|
-
result = Horsefield::Scraper.new.scrape html: @html do
|
24
|
-
scope '#primaryResults' do
|
25
|
-
many :jobs, '.listingsTable .odd, .listingsTable .even' do
|
26
|
-
one :title, '.jobTitleContainer'
|
27
|
-
one :company, '.companyContainer'
|
28
|
-
end
|
29
|
-
end
|
30
|
-
end
|
31
|
-
|
32
|
-
expect(result[:jobs].length).to eq(9)
|
33
|
-
end
|
34
|
-
|
35
|
-
it 'should return nil for selectors that are not found' do
|
36
|
-
result = Horsefield::Scraper.new.scrape html: @html do
|
37
|
-
one :job, '.listingsTable .odd, .listingsTable .even' do
|
38
|
-
one :title, '.jobTitleContainer'
|
39
|
-
one :company, '.companyContainer'
|
40
|
-
one :missing, '.doesNotExist'
|
41
|
-
end
|
42
|
-
end
|
43
|
-
|
44
|
-
expect(result[:job][:missing]).to be_nil
|
45
|
-
end
|
46
|
-
|
47
|
-
it 'can return HTML instead of text' do
|
48
|
-
result = Horsefield::Scraper.new.scrape html: @html do
|
49
|
-
one :job, '.listingsTable .odd, .listingsTable .even' do
|
50
|
-
one :title, '.jobTitleContainer', :html
|
51
|
-
one :company, '.companyContainer'
|
52
|
-
one :missing, '.doesNotExist'
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
56
|
-
expect(result[:job][:title]).to match(/<a id=\"ctl00_ctl00_ctl00_body_body_wacCenterStage_ctl02_rptResults_ctl00_linkJobTitle\"/)
|
57
|
-
end
|
58
|
-
|
59
|
-
it 'works with Watir' do
|
60
|
-
browser = Watir::Browser.new :phantomjs
|
61
|
-
|
62
|
-
result = Horsefield::Scraper.new(browser).scrape 'https://github.com/cowboy' do
|
63
|
-
one :email, 'a.email'
|
64
|
-
many :organizations, '//div[@class="orgs"]//@original-title'
|
65
|
-
end
|
66
|
-
|
67
|
-
p result
|
68
|
-
end
|
69
|
-
end
|
70
|
-
end
|
data/spec/spec_helper.rb
DELETED
@@ -1,29 +0,0 @@
|
|
1
|
-
require 'pry'
|
2
|
-
#require 'vcr'
|
3
|
-
#require 'webmock'
|
4
|
-
|
5
|
-
#VCR.configure do |config|
|
6
|
-
# config.cassette_library_dir = 'spec/fixtures/vcr_cassettes'
|
7
|
-
# config.hook_into :webmock
|
8
|
-
# config.configure_rspec_metadata!
|
9
|
-
# config.allow_http_connections_when_no_cassette = true
|
10
|
-
#end
|
11
|
-
|
12
|
-
# This file was generated by the `rspec --init` command. Conventionally, all
|
13
|
-
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
14
|
-
# Require this file using `require "spec_helper"` to ensure that it is only
|
15
|
-
# loaded once.
|
16
|
-
#
|
17
|
-
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
18
|
-
RSpec.configure do |config|
|
19
|
-
config.treat_symbols_as_metadata_keys_with_true_values = true
|
20
|
-
config.run_all_when_everything_filtered = true
|
21
|
-
config.filter_run :focus
|
22
|
-
config.treat_symbols_as_metadata_keys_with_true_values = true
|
23
|
-
|
24
|
-
# Run specs in random order to surface order dependencies. If you find an
|
25
|
-
# order dependency and want to debug it, you can fix the order by providing
|
26
|
-
# the seed, which is printed after each run.
|
27
|
-
# --seed 1234
|
28
|
-
config.order = 'random'
|
29
|
-
end
|