korfzone-scraper 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +5 -0
- data/.simplecov +3 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +64 -0
- data/Rakefile +4 -0
- data/korfzone-scraper.gemspec +28 -0
- data/lib/korfzone/scraper/block_page.rb +39 -0
- data/lib/korfzone/scraper/categories.rb +26 -0
- data/lib/korfzone/scraper/games_page.rb +112 -0
- data/lib/korfzone/scraper/page.rb +47 -0
- data/lib/korfzone/scraper/version.rb +5 -0
- data/lib/korfzone/scraper.rb +38 -0
- data/spec/korfzone/scraper/page_spec.rb +224 -0
- data/spec/spec_helper.rb +4 -0
- data/spec/support/files/beta-Wedstrijden-senioren-veld-V01.html +1031 -0
- data/spec/support/files/beta-Wedstrijden-senioren-veld-V011001.html +790 -0
- data/spec/support/files/beta-Wedstrijden-senioren-veld-V011002.html +794 -0
- data/spec/support/files/beta-Wedstrijden-senioren-veld-V011003.html +788 -0
- data/spec/support/files/beta-Wedstrijden-senioren-veld-V011004.html +794 -0
- data/spec/support/files/beta-Wedstrijden-senioren-veld-V011005.html +793 -0
- data/spec/support/files/beta-Wedstrijden-senioren-veld-V011006.html +797 -0
- data/spec/support/files/beta-Wedstrijden-senioren-veld-V011007.html +793 -0
- data/spec/support/files/beta-Wedstrijden-senioren-veld.html +721 -0
- data/spec/support/files/games.yml +125 -0
- data/spec/support/games/V011001.yml +15 -0
- data/spec/support/games/V011002.yml +16 -0
- data/spec/support/games/V011003.yml +15 -0
- data/spec/support/games/V011004.yml +2 -0
- data/spec/support/games/V011005.yml +18 -0
- data/spec/support/games/V011006.yml +19 -0
- data/spec/support/games/V011007.yml +19 -0
- data/spec/support/games/V011008.yml +18 -0
- data/spec/support/open_mock.rb +16 -0
- metadata +172 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 92440e35b72111385ff8bd5a4ff54f5968593e43
|
4
|
+
data.tar.gz: 2349511ac04326bc50f420183b67ed2ed54a1e8e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 09532104cc8a7b680b7a56204eba9137501c541767e0bb7b9ee1466bb12d3c27194a91358a4e00540904075850075ccf9ca3a1932e7d70ebc71f406e2d20b978
|
7
|
+
data.tar.gz: 84cb5e0cc1f2bfd96dc84d0c973f87059229f58ed56bf811465fe19009d6babc8731bfa3b89572960b2aa12c9316e4e4942fc817cd504d8c77591d09075b657f
|
data/.gitignore
ADDED
data/.simplecov
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Rutger Claes
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
# Korfzone::Scraper
|
2
|
+
|
3
|
+
Using this scraper you can extract all information about korfball games from the korfbal.be website.
|
4
|
+
|
5
|
+
The game information extracted using this tool is not linked. For example, to list all games for one club you will have to map all team names to a club. The [Korfzone API](http://korfzone.be/api) offers a linked version of the data scraped from the KBKB website. It also hosts data from previous seasons.
|
6
|
+
|
7
|
+
This project is extracted from the Korfzone code base. The korfbal.be website uses very little to no semantic HTML. The scraper is therefore very brittle. If the developers decide to add a column to the games table or they change the class of the rows, this code will break.
|
8
|
+
|
9
|
+
Korfzone and the Korfzone Scraper are not affiliated with the KBKB in any way.
|
10
|
+
|
11
|
+
## Installation
|
12
|
+
|
13
|
+
Add this line to your application's Gemfile:
|
14
|
+
|
15
|
+
gem 'korfzone-scraper'
|
16
|
+
|
17
|
+
And then execute:
|
18
|
+
|
19
|
+
$ bundle
|
20
|
+
|
21
|
+
Or install it yourself as:
|
22
|
+
|
23
|
+
$ gem install korfzone-scraper
|
24
|
+
|
25
|
+
## Usage
|
26
|
+
|
27
|
+
The scraper has two main functions: first, finding the URIs of all pages containing game information, and second, extracting that information from those pages. The korfbal.be website does not offer a single, easy-to-use entry point from which to scrape all URIs; the links to the relevant pages are themselves scattered across a number of pages. Below are examples of how to find the relevant URIs and how to extract games from a single URI.
|
28
|
+
|
29
|
+
### Scraping the URIs of all pages for a certain category
|
30
|
+
|
31
|
+
```ruby
|
32
|
+
entry_points = Korfzone::Scraper::Page.for_category :senioren
|
33
|
+
entry_points.each do |page|
|
34
|
+
puts page.block_uris.map { |uri| uri.to_s }.join( "\n" )
|
35
|
+
end
|
36
|
+
```
|
37
|
+
|
38
|
+
### Scraping the games of an individual page
|
39
|
+
|
40
|
+
```ruby
|
41
|
+
page = Korfzone::Scraper::Page.new 'http://www.korfbal.be/beta/Wedstrijden/senioren/veld/V01'
|
42
|
+
page.games do |game|
|
43
|
+
puts game[ :starts_at ]
|
44
|
+
puts game[ :teams ].join( ' - ' )
|
45
|
+
puts game[ :location ]
|
46
|
+
puts "=" * 80
|
47
|
+
end
|
48
|
+
```
|
49
|
+
|
50
|
+
## ToDo list
|
51
|
+
|
52
|
+
1. Handle errors. The korfbal.be site occasionally returns status code 500. The scraper should handle this.
|
53
|
+
2. Check whether the korfbal.be website supports ETags. If it does, it would make sense to support them in the scraper as well.
|
54
|
+
3. Write an executable, probably based on Thor, to facilitate scraping.
|
55
|
+
|
56
|
+
## Contributing
|
57
|
+
|
58
|
+
1. Fork it
|
59
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
60
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
61
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
62
|
+
5. Create new Pull Request
|
63
|
+
|
64
|
+
Consider also contributing to the overall [Korfzone project](http://github.org/korfzone).
|
data/Rakefile
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for korfzone-scraper.
#
# The lib directory is put on the load path first so the gem version can be
# read from the library itself.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'korfzone/scraper/version'

Gem::Specification.new do |spec|
  spec.name          = "korfzone-scraper"
  spec.version       = Korfzone::Scraper::VERSION
  spec.authors       = ["Rutger Claes"]
  spec.email         = ["rutger@korfzone.be"]

  spec.description   = %q{Scrape the KBKB korfbal.be website}
  spec.summary       = %q{The code needed to scrape all games of the KBKB website (korfbal.be) extracted from the Korfzone project. This code is expected to break whenever the KBKB updates its website.}

  spec.homepage      = "http://github.org/korfzone/korfzone-scraper"
  spec.license       = "MIT"

  # The packaged files are whatever git tracks; executables and test files
  # are derived from that list by path prefix.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^spec/})
  spec.require_paths = ["lib"]

  # Runtime dependency: the library does `require 'nokogiri'` when the page
  # class is loaded, so the gem cannot be used without it being installed.
  spec.add_dependency "nokogiri"

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "debugger"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rspec"
  spec.add_development_dependency "simplecov"
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module Korfzone

  module Scraper

    # Code to handle links to different block pages.
    #
    # Games on the {korfbal.be} website are divided into blocks. Every block
    # has its own page. At the top of every page links to every other block
    # can be found.
    #
    # The including class is expected to provide +document+ (responding to
    # +css+) and +resolve_uri+.
    module BlockPage

      # Collects the URIs of all block pages linked from this page.
      #
      # Each href attribute matched by the '.block @href' selector is passed
      # through +resolve_uri+, yielded to the block (when one is given) and
      # appended to the result.
      #
      # @return [Array<URI>] all block URIs found on this page
      # @yieldparam [URI] a URI found on this page
      def block_uris
        uris = []
        document.css( '.block @href' ).each do |href_attribute|
          uri = resolve_uri( href_attribute.value )
          yield uri if block_given?
          uris << uri
        end
        uris
      end

      # Wraps every URI returned by {#block_uris} in a
      # {Korfzone::Scraper::Page} object.
      #
      # @return [Array<Page>] the other block pages
      # @yieldparam [Page] a page containing games for one block
      def block_pages
        block_uris.map do |uri|
          page = Page.new( uri.to_s )
          yield page if block_given?
          page
        end
      end

    end

  end

end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Korfzone

  module Scraper

    # Starting URIs for all of the competitions.
    #
    # The outer keys are the categories, the inner keys the competitions
    # within a category. Both levels are frozen because this is a shared
    # lookup table that should not be modified at runtime.
    #
    # Note that the senioren beker URL contains a literal space; Page escapes
    # the URL it is constructed with.
    CATEGORIES = {
      senioren: {
        zaal: 'http://korfbal.be/beta/Wedstrijden/senioren/zaal',
        veld: 'http://korfbal.be/beta/Wedstrijden/senioren/veld',
        beker: 'http://korfbal.be/beta/Wedstrijden/senioren/beker senioren'
      }.freeze,
      jeugd: {
        veldvoorronde: 'http://korfbal.be/beta/Wedstrijden/jeugd/veldvoorronde',
        zaal: 'http://korfbal.be/beta/Wedstrijden/jeugd/zaal',
        veldeindronde: 'http://korfbal.be/beta/Wedstrijden/jeugd/veldeindronde',
        beker: 'http://korfbal.be/beta/Wedstrijden/jeugd/beker'
      }.freeze,
      gewestelijken: {
        veld: 'http://korfbal.be/beta/Wedstrijden/gewestelijken/veld',
        zaal: 'http://korfbal.be/beta/Wedstrijden/gewestelijken/zaal'
      }.freeze
    }.freeze

  end

end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
module Korfzone

  module Scraper

    # Code to extract all games from one page.
    #
    # The including class is expected to provide +document+ (responding to
    # +css+ and +xpath+) and +resolve_uri+.
    module GamesPage

      # @return [Symbol] the category taken from the page title element
      def category
        extract_category_and_competition if @category.nil?
        @category
      end

      # @return [String] the competition taken from the page title element
      def competition
        extract_category_and_competition if @competition.nil?
        @competition
      end

      # Extracts all games listed on this page.
      #
      # Rows are walked in document order. A row is taken as a division
      # header when no division has been seen yet or when it contains a
      # 'gameLeagueHeader' cell; every other row is parsed as a game that
      # belongs to the most recent division. Rows for which
      # {#parse_game_row} returns nil (incomplete games) are skipped.
      #
      # @return [Array<Hash>] the attributes of every game on the page
      # @yieldparam [Hash] the attributes of one game
      def games
        found = []
        division = nil
        rows = document.xpath( '//*[@id="ContentPlaceHolder1_UpdatePanel1"]/table[2]/tr[td[ contains( @class, "gameLeagueHeader" ) or contains( @class, "rptItem" ) or contains( @class, "rptAltItem" )]]' )

        rows.each do |row|
          if division.nil? || row.css( 'td.gameLeagueHeader' ).any?
            division = normalize_whitespace( row.text )
            next
          end

          attributes = parse_game_row( row )
          next if attributes.nil?

          attributes.merge!( division: division, category: category.to_s, competition: competition )
          yield attributes if block_given?
          found << attributes
        end

        found
      end

      private

      CATEGORY_ID = 'ContentPlaceHolder1_lblTitle'

      # Reads the page title element and splits it on the first '-' into
      # category (with a leading "wedstrijden " removed) and competition.
      #
      # @return [Array] the category symbol and the competition string
      def extract_category_and_competition
        title = document.css( "##{CATEGORY_ID}" ).first.text
        parts = title.split( '-', 2 ).map { |part| part.strip }

        @category = parts.first.gsub( /^wedstrijden /i, '' ).to_sym
        @competition = parts.last

        [ @category, @competition ]
      end

      # Maps the cells of one table row onto named game attributes.
      #
      # Rows with 10 cells carry no scores, rows with 11 cells do. A cell
      # that contains a link is extracted as [ link text, resolved URI ],
      # any other cell as its text.
      #
      # @return [Hash, nil] the cleaned attributes, or nil for an incomplete game
      # @raise [RuntimeError] when the row has an unexpected number of cells
      def parse_game_row( row )
        cells = row.css( 'td' )

        mapping = case cells.count
                  when 10
                    [ :code, nil, :date, :time, :home_team, :visiting_team, nil, :referee, nil, :location ]
                  when 11
                    [ :code, nil, :date, :time, :home_team, :visiting_team, :home_score, :visiting_score, :referee, nil, :location ]
                  else
                    raise "Unparseable row encountered: #{row.to_s}"
                  end

        values = cells.map do |td|
          link = td.css( 'a' ).first
          link ? [ link.text, resolve_uri( link[ 'href' ] ) ] : td.text
        end

        # Columns mapped to nil are not of interest and are dropped.
        attributes = Hash[ mapping.zip( values ).reject { |key, _value| key.nil? } ]
        game_is_complete?( attributes ) ? cleanup_game_attributes( attributes ) : nil
      end

      # A game is complete when both teams, the date and the time are present
      # and not blank.
      def game_is_complete?( data )
        [ :home_team, :visiting_team, :date, :time ].all? { |key| !data[ key ].to_s.strip.empty? }
      end

      # Turns the raw cell values into the final game attributes.
      #
      # * :date and :time are combined into :starts_at (local time)
      # * teams, code and referee get their whitespace normalized
      # * :home_team and :visiting_team are combined into :teams
      # * scores, when present, are combined into :scores; :forfait is set
      #   when the visiting score cell mentions "ff"
      # * a blank :referee or :location is removed
      #
      # The input hash and the values it holds are left untouched.
      #
      # @param data [Hash] raw attributes as produced by {#parse_game_row}
      # @return [Hash] the cleaned attributes
      def cleanup_game_attributes( data )
        data.clone.tap do |game|
          # Fixing the date
          day, month, year = game.delete( :date ).split( '/', 3 ).map { |i| i.to_i }
          hour, minutes = game.delete( :time ).split( ':', 2 ).map { |i| i.to_i }
          game[ :starts_at ] = Time.local( year, month, day, hour, minutes, 0 )

          # Cleanup the teams and the game code
          [ :home_team, :visiting_team, :code, :referee ].each do |key|
            game[ key ] = normalize_whitespace( game[ key ] )
          end

          # Cleanup the location name. A linked location is a [ name, uri ]
          # pair, an unlinked one is plain text. A new value is built instead
          # of assigning to index 0 in place: on a String that would only
          # touch the first character (and fail on an empty string), on an
          # Array it would modify the caller's data.
          if game.key?( :location )
            location = game[ :location ]
            game[ :location ] = if location.is_a?( Array )
              [ normalize_whitespace( location.first.to_s ) ] + location.drop( 1 )
            else
              normalize_whitespace( location.to_s )
            end
          end

          # Combine the teams
          game[ :teams ] = [ game.delete( :home_team ), game.delete( :visiting_team ) ]

          # Combine the scores if there are any
          if game.key?( :home_score )
            forfait = game[ :visiting_score ] =~ /ff/i
            home_score = game.delete( :home_score ).gsub( /[^0-9]+/, '' ).to_i
            visiting_score = game.delete( :visiting_score ).gsub( /[^0-9]+/, '' ).to_i
            game[ :scores ] = [ home_score, visiting_score ]
            game[ :forfait ] = true if forfait
          end

          # Remove empty fields
          [ :referee, :location ].each { |key| game.delete( key ) if blank_field?( game[ key ] ) }
        end
      end

      # Strips the text and collapses every run of whitespace into one space.
      def normalize_whitespace( text )
        text.strip.gsub( /\s+/, ' ' )
      end

      # True for nil and for strings that hold nothing but whitespace. Only
      # strings are matched against the pattern: a linked location is an
      # Array, and Array does not implement =~ on current Ruby versions.
      def blank_field?( value )
        return true if value.nil?
        value.is_a?( String ) && !( value =~ /\A\s*\z/ ).nil?
      end

    end
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'uri'
|
3
|
+
|
4
|
+
require 'korfzone/scraper/block_page'
|
5
|
+
require 'korfzone/scraper/games_page'
|
6
|
+
|
7
|
+
module Korfzone

  module Scraper

    # A single page, identified by its URL. The document behind the URL is
    # fetched and parsed on first access.
    class Page

      include BlockPage
      include GamesPage

      attr_accessor :url

      # Builds a page for every starting URL registered for a category.
      #
      # @param category [Symbol] a key of {Korfzone::Scraper::CATEGORIES}
      # @return [Array<Page>] one page per competition of that category
      def self.for_category( category )
        ::Korfzone::Scraper::CATEGORIES[ category ].values.map { |url| Page.new url }
      end

      # @param url [#to_s] the location of the page; it is escaped before
      #   being parsed, so it may contain characters such as spaces
      def initialize( url )
        @url = URI( escape_uri( url.to_s ) )
      end

      # @return the parsed document, fetched through
      #   {Korfzone::Scraper.fetch} on first use and memoized afterwards
      def document
        @document ||= Nokogiri::HTML( ::Korfzone::Scraper.fetch url )
      end

      # Two pages are equal when they point to the same URL.
      def ==( other_page )
        other_page.respond_to?( :url ) && other_page.url == url
      end

      protected

      # Turns a link found on this page into a full URI.
      #
      # Absolute http(s) links are returned as they are (escaped and parsed).
      # Any other link is treated as a path and placed on a copy of this
      # page's own URL.
      #
      # @param uri [#to_s] the link as found in the document
      # @return [URI] the resolved URI
      def resolve_uri( uri )
        link = uri.to_s
        # Without this early return an absolute link would fall through and
        # be assigned as the path of the page URL.
        return URI( escape_uri( link ) ) if link =~ %r{\Ahttps?://}i

        @url.clone.tap do |resolved_uri|
          resolved_uri.path = escape_uri( link )
        end
      end

      private

      # Escapes characters that are not allowed in a URI. URI.escape is not
      # available on Ruby 3.0 and later; the default parser offers the same
      # operation.
      def escape_uri( value )
        URI::DEFAULT_PARSER.escape( value )
      end

    end

  end

end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require "korfzone/scraper/version"
|
2
|
+
|
3
|
+
require 'open-uri'
|
4
|
+
|
5
|
+
module Korfzone

  # The module containing all scraping related code.
  #
  # The games on the korfbal.be website are scattered across a number of pages. A first division is made
  # on the category of the game ('senioren', 'gewestelijken' or 'jeugd'). A second division is made on the
  # competition of the game ('veld', 'zaal', 'beker', ... ). The final division is into blocks based on the day
  # of the match.
  #
  # There is no simple way to scrape the URIs of the different pages that can act as starting point to scrape a competition,
  # therefore the starting points are listed explicitly in {Korfzone::Scraper::CATEGORIES}. The pages listed there
  # have a link to all blocks for that competition.
  #
  # * {Korfzone::Scraper::Page} is the main class. It represents a single scrapeable page on korfbal.be.
  # * {Korfzone::Scraper::BlockPage} contains all functionality to extract the different game day pages.
  # * {Korfzone::Scraper::GamesPage} contains all functionality to extract games from a page.
  #
  # @author Rutger Claes <rutger@korfzone.be>
  module Scraper

    # Method to fetch content, introduced to improve testability.
    #
    # The URI is parsed and opened through the +open+ method that open-uri
    # adds to URI objects. A bare Kernel#open is avoided on purpose: it no
    # longer fetches URLs on Ruby 3.0 and later, and it would run a string
    # starting with "|" as a command.
    #
    # @param uri [#to_s] The uri to fetch
    # @return the contents of the uri, as returned by open-uri
    # @raise [URI::InvalidURIError] when the uri cannot be parsed
    # @raise [ArgumentError] when the uri is of a kind open-uri cannot open
    def self.fetch( uri )
      target = URI.parse( uri.to_s )
      unless target.respond_to?( :open )
        raise ArgumentError, "Cannot fetch #{uri}: not a URI that open-uri can open"
      end
      target.open
    end

  end

end
|
36
|
+
|
37
|
+
require 'korfzone/scraper/categories'
|
38
|
+
require 'korfzone/scraper/page'
|
@@ -0,0 +1,224 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for Korfzone::Scraper::Page.
#
# stub_korfzone_fetch, load_game and BLOCK_URIS are not defined in this file;
# presumably they come from the spec support files - confirm there.
describe Korfzone::Scraper::Page do

  it "should download and parse its uri" do
    uri = 'http://www.korfbal.be/beta/Wedstrijden/senioren/veld'
    stub_korfzone_fetch uri
    page = Korfzone::Scraper::Page.new uri
    expect( page.document ).not_to be_nil
  end

  describe "containing links to other blocks" do

    it "should list all blocks as uri" do
      uri = 'http://www.korfbal.be/beta/Wedstrijden/senioren/veld'
      stub_korfzone_fetch uri
      page = Korfzone::Scraper::Page.new uri
      expect( page.block_uris ).to eq( BLOCK_URIS )
    end

    it "should list all blocks as page" do
      uri = 'http://www.korfbal.be/beta/Wedstrijden/senioren/veld'
      stub_korfzone_fetch uri
      page = Korfzone::Scraper::Page.new uri
      expect( page.block_pages ).to eq( BLOCK_URIS.map { |u| Korfzone::Scraper::Page.new u } )
    end

  end

  # The shared groups below rely on the including group to define
  # +page+, +code+, +game+ and +game_attributes+.

  shared_examples_for "successful scraping" do

    it "should load the dom tree" do
      expect( page.document.to_s ).to match( /#{code}/i )
    end

  end


  shared_examples_for "basic game" do

    it "should extract the game" do
      expect( game ).not_to be_nil
      expect( game ).to be_kind_of Hash
    end

    [ :code, :starts_at, :teams, :division, :category ].each do |attribute|
      it "should extract the games #{attribute}" do
        expect( game[ attribute ] ).to eq( game_attributes[ attribute.to_s ] )
      end
    end

  end

  shared_context "without referee" do

    it "should not have a referee" do
      expect( game.keys ).not_to include( :referee )
    end

  end

  shared_context "with referee" do

    it "should have a referee key" do
      expect( game.keys ).to include( :referee )
    end

    it "should extract the correct referee" do
      expect( game[ :referee ] ).to eq( game_attributes[ 'referee' ] )
    end

  end

  shared_context "without location" do

    it "should not have a location" do
      expect( game.keys ).not_to include( :location )
    end

  end

  shared_context "with location" do

    it "should have a location attribute" do
      expect( game.keys ).to include( :location )
    end

    it "should extract the location name" do
      expect( game[ :location ].first ).to eq( game_attributes[ 'location' ].first )
    end

    it "should extract the location url" do
      expect( game[ :location ].last.to_s ).to eq( game_attributes[ 'location' ].last.to_s )
    end

  end

  shared_context "in the future" do

    it "should not have a score" do
      expect( game.keys ).not_to include( :scores )
    end

    it "should not have a forfait" do
      expect( game.keys ).not_to include( :forfait )
    end

  end

  shared_context "already played" do

    it "should have a score" do
      expect( game.keys ).to include( :scores )
    end

    it "should extract the score" do
      expect( game[ :scores ] ).to eq( game_attributes[ 'scores' ] )
    end

  end

  shared_context "with forfait" do

    it "should have key forfait" do
      expect( game.keys ).to include( :forfait )
    end

    it "should have the forfait set to true" do
      # eq( true ) is used instead of the be_true matcher, which no longer
      # exists in RSpec 3; eq works on both RSpec 2 and RSpec 3.
      expect( game[ :forfait ] ).to eq( true )
    end

  end

  shared_context "without forfait" do

    it "should not have key forfait" do
      expect( game.keys ).not_to include( :forfait )
    end

  end


  describe "page containing game" do

    let( :page_uri ) { load_game( code ).first }
    let( :game_attributes ) { load_game( code ).last }
    let( :page ) { stub_korfzone_fetch( page_uri ); Korfzone::Scraper::Page.new( page_uri ) }
    let( :game ) { page.games.find { |game| game[ :code ] == code } }

    describe "V011001" do
      let( :code ) { 'V011001' }

      it_behaves_like "successful scraping"
      it_behaves_like "basic game"
      it_behaves_like "in the future"
      it_behaves_like "without referee"
      it_behaves_like "with location"
    end

    describe "V011002" do
      let( :code ) { 'V011002' }

      it_behaves_like "successful scraping"
      it_behaves_like "basic game"
      it_behaves_like "in the future"
      it_behaves_like "with referee"
      it_behaves_like "with location"
    end

    describe "V011003" do
      let( :code ) { 'V011003' }

      it_behaves_like "successful scraping"
      it_behaves_like "basic game"
      it_behaves_like "in the future"
      it_behaves_like "without referee"
      it_behaves_like "without location"
    end

    describe "V011005" do
      let( :code ) { 'V011005' }

      it_behaves_like "successful scraping"
      it_behaves_like "basic game"
      it_behaves_like "already played"
      it_behaves_like "without forfait"
      it_behaves_like "without referee"
      it_behaves_like "with location"
    end

    describe "V011006" do
      let( :code ) { 'V011006' }

      it_behaves_like "successful scraping"
      it_behaves_like "basic game"
      it_behaves_like "already played"
      it_behaves_like "without forfait"
      it_behaves_like "with referee"
      it_behaves_like "with location"
    end

    describe "V011007" do
      let( :code ) { 'V011007' }

      it_behaves_like "successful scraping"
      it_behaves_like "basic game"
      it_behaves_like "already played"
      it_behaves_like "with forfait"
      it_behaves_like "without referee"
      it_behaves_like "with location"
    end

    describe "V011004" do
      let( :code ) { 'V011004' }

      it_behaves_like "successful scraping"

      it "should not include a free match" do
        expect( page.games ).to be_empty
      end
    end

  end
end
|
data/spec/spec_helper.rb
ADDED