scrap_cbf 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/scrap_cbf.rb +92 -0
- data/lib/scrap_cbf/builders/matches_builder.rb +21 -0
- data/lib/scrap_cbf/builders/matches_per_round_builder.rb +113 -0
- data/lib/scrap_cbf/builders/rankings_builder.rb +118 -0
- data/lib/scrap_cbf/builders/rounds_builder.rb +75 -0
- data/lib/scrap_cbf/builders/teams_builder.rb +78 -0
- data/lib/scrap_cbf/document.rb +93 -0
- data/lib/scrap_cbf/errors.rb +51 -0
- data/lib/scrap_cbf/formattable.rb +16 -0
- data/lib/scrap_cbf/helpers/lib/depth_search.rb +37 -0
- data/lib/scrap_cbf/helpers/lib/element_nokogiri.rb +17 -0
- data/lib/scrap_cbf/helpers/lib/findable.rb +33 -0
- data/lib/scrap_cbf/helpers/matches_helper.rb +136 -0
- data/lib/scrap_cbf/helpers/rankings_helper.rb +19 -0
- data/lib/scrap_cbf/helpers/teams_helper.rb +11 -0
- data/lib/scrap_cbf/models/championship.rb +26 -0
- data/lib/scrap_cbf/models/match.rb +38 -0
- data/lib/scrap_cbf/models/ranking.rb +45 -0
- data/lib/scrap_cbf/models/round.rb +33 -0
- data/lib/scrap_cbf/models/table/cell.rb +10 -0
- data/lib/scrap_cbf/models/table/header_column.rb +11 -0
- data/lib/scrap_cbf/models/table/row.rb +10 -0
- data/lib/scrap_cbf/models/team.rb +19 -0
- data/lib/scrap_cbf/printable.rb +19 -0
- data/lib/scrap_cbf/samples/cbf_serie_a_2020.html +17330 -0
- data/lib/scrap_cbf/version.rb +5 -0
- metadata +193 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 468506451d0e8869e39a079ddaacf33a8aacc1e2a79052cdb251611217874c0d
|
4
|
+
data.tar.gz: 05a8c5cc7e71b8323ee4e3bd73ae723519a24ffe357e392910c044186496b586
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ff2bad7d68f58a68d8499c34f175bac2231616f41c99190fa91f80e0a8ed1ecf59fe7717a81bc3a86cefe73fdefaa52028248a6f5f7456a4f40ad85ef1e468b4
|
7
|
+
data.tar.gz: 01bc568cdca4a5ccd530bdafd42706b45724f5cd1f6bb01db23182ea8f9ea74eba4e031f7c0603fcefadfba900b6643616486e8499c354c13ba578760096da83
|
data/lib/scrap_cbf.rb
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'open-uri'
|
5
|
+
require 'json'
|
6
|
+
require 'active_support/core_ext/hash/indifferent_access'
|
7
|
+
require 'forwardable'
|
8
|
+
|
9
|
+
require_relative 'scrap_cbf/errors'
|
10
|
+
require_relative 'scrap_cbf/formattable'
|
11
|
+
require_relative 'scrap_cbf/printable'
|
12
|
+
require_relative 'scrap_cbf/document'
|
13
|
+
require_relative 'scrap_cbf/models/table/header_column'
|
14
|
+
require_relative 'scrap_cbf/models/table/row'
|
15
|
+
require_relative 'scrap_cbf/models/table/cell'
|
16
|
+
require_relative 'scrap_cbf/models/championship'
|
17
|
+
require_relative 'scrap_cbf/models/team'
|
18
|
+
require_relative 'scrap_cbf/models/round'
|
19
|
+
require_relative 'scrap_cbf/models/match'
|
20
|
+
require_relative 'scrap_cbf/models/ranking'
|
21
|
+
require_relative 'scrap_cbf/helpers/matches_helper'
|
22
|
+
require_relative 'scrap_cbf/helpers/rankings_helper'
|
23
|
+
require_relative 'scrap_cbf/helpers/teams_helper'
|
24
|
+
require_relative 'scrap_cbf/builders/matches_builder'
|
25
|
+
require_relative 'scrap_cbf/builders/matches_per_round_builder'
|
26
|
+
require_relative 'scrap_cbf/builders/rankings_builder'
|
27
|
+
require_relative 'scrap_cbf/builders/rounds_builder'
|
28
|
+
require_relative 'scrap_cbf/builders/teams_builder'
|
29
|
+
|
30
|
+
# ScrapCbf is a gem created for scraping data from the CBF official page.
|
31
|
+
# Some of the data found on the CBF page are:
|
32
|
+
# teams, matches, rounds and ranking table from all championships found
|
33
|
+
# on the official page.
|
34
|
+
class ScrapCbf
  include Formattable
  include Printable

  # @!attribute [r] document
  #   @return [ScrapCbf::Document] ScrapCbf::Document instance.
  # @!attribute [r] championship
  #   @return [ScrapCbf::Championship] ScrapCbf::Championship instance.
  attr_reader :document, :championship

  # Creates the document and the championship for the requested year/serie.
  # The whole options hash is also handed over to Document.new.
  #
  # @param [Hash] opts
  # @option opts [Integer] :year The Championship year (current year if absent).
  # @option opts [Symbol] :serie The Championship serie (:serie_a if absent).
  # @option opts [Symbol] :load_from_sample Load championship from sample.
  # @option opts [Symbol] :sample_path Path to the sample, otherwise default.
  #
  # @return [ScrapCbf] new instance
  def initialize(opts = {})
    year = opts.fetch(:year) { Date.today.year }
    serie = opts.fetch(:serie) { :serie_a }

    @document = Document.new(year, serie, opts)
    @parsed_document = @document.parsed_document
    @championship = Championship.new(year, serie)
  end

  # Gathers every scraped entity into one hash that can be read with either
  # string or symbol keys.
  #
  # @return [ActiveSupport::HashWithIndifferentAccess]
  def to_h
    entities = {
      championship: championship.to_h,
      matches: matches.to_h,
      rankings: rankings.to_h,
      rounds: rounds.to_h,
      teams: teams.to_h
    }

    entities.with_indifferent_access
  end

  # Memoized; derived from the rounds builder.
  #
  # @return [MatchesBuilder] instance.
  def matches
    @matches ||= rounds.matches_builder
  end

  # Memoized.
  #
  # @return [RankingsBuilder] instance.
  def rankings
    @rankings ||= RankingsBuilder.new(@parsed_document, @championship)
  end

  # Memoized.
  #
  # @return [RoundsBuilder] instance.
  def rounds
    @rounds ||= RoundsBuilder.new(@parsed_document, @championship)
  end

  # Memoized.
  #
  # @return [TeamsBuilder] instance.
  def teams
    @teams ||= TeamsBuilder.new(@parsed_document)
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class ScrapCbf
  # Thin collection wrapper around a list of matches that was already built
  # elsewhere (see RoundsBuilder#matches_builder in this gem).
  class MatchesBuilder
    extend Forwardable
    include Formattable
    include Printable

    # Iteration helpers are forwarded straight to the wrapped collection.
    def_delegators :@matches, :each, :map

    # @return the wrapped matches collection, exactly as given to #initialize.
    attr_accessor :matches

    # @param matches collection of match objects; each must respond to #to_h
    #   for #to_h to work.
    def initialize(matches)
      @matches = matches
    end

    # @return [Array] the result of calling #to_h on every wrapped match.
    def to_h
      matches.map(&:to_h)
    end
  end
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class ScrapCbf
  # Builds the Match entities of one round out of the element that contains
  # that round's matches.
  class MatchesPerRoundBuilder
    extend Forwardable
    include MatchesHelper
    include Formattable
    include Printable

    delegate %i[each map] => :@matches

    # @return [Array<Match>] matches scraped for this round.
    attr_reader :matches
    alias all matches

    # @param matches_elements node whose element children are the matches
    # @param round_number value stored on every match as its round
    # @param championship object responding to #year and #serie
    def initialize(matches_elements, round_number, championship)
      @championship = championship
      @matches = []

      scrap_matches(matches_elements, round_number)
    end

    # @return [Array] the result of calling #to_h on every match.
    def to_h
      @matches.map(&:to_h)
    end

    private

    # Appends one Match per element child; non-element nodes (e.g. text
    # nodes between tags) are skipped.
    def scrap_matches(matches_elements, round_number)
      matches_elements.children.each do |match_element|
        next unless match_element.element?

        @matches << scrap_match(match_element, round_number)
      end
    end

    # Builds a single Match. The order of the scrap_* calls matters:
    # scrap_start_at relies on the date/start_at set by scrap_info.
    def scrap_match(match_element, round_number)
      match = Match.new
      match.championship = @championship.year
      match.serie = @championship.serie

      match.round = round_number

      # e.g "Qua, 03/02/2021 16:00 - Jogo: 336"
      scrap_info(match, match_element)
      # e.g <img title="team-name">
      scrap_teams(match, match_element)
      # e.g "1 alteração" (can be undefined)
      scrap_update(match, match_element)
      # e.g "16:00" (can be found in two places, we take only the first)
      scrap_start_at(match, match_element)
      # e.g "1 x 1" (can be undefined)
      scrap_score(match, match_element)
      # e.g "Vila Belmiro - Santos - SP" (can be undefined)
      scrap_place(match, match_element)

      match
    end

    # Extracts match id, date and start time from the info text.
    # Nothing is set when the helper finds no info text.
    def scrap_info(match, match_element)
      info = find_info_helper(match_element)
      return unless info

      # e.g "Jogo: 336" -> "336". A capture group is used so the label is
      # dropped with the same (case-insensitive) pattern that found it, and
      # an unexpected text yields nil instead of raising on nil.
      match.id_match = info[/Jogo: (\d{1,3})$/i, 1]
      # e.g "03/02/2021" (can be undefined)
      match.date = info[%r{\d{2}/\d{2}/\d{2,4}}]
      # e.g "16:00" (can be undefined)
      match.start_at = info[/\d{2}:\d{2}/]
    end

    def scrap_place(match, match_element)
      match.place = find_place_helper(match_element)
    end

    def scrap_score(match, match_element)
      score = find_score_helper(match_element)

      match.team_score = score_by_team_helper(:team, score)
      match.opponent_score = score_by_team_helper(:opponent, score)
    end

    def scrap_update(match, match_element)
      match.updates = find_updates_helper(match_element)
    end

    # Reads the two team names from the <img> elements of the match.
    #
    # @raise [InvalidNumberOfEntitiesError] when the match does not contain
    #   exactly two <img> elements.
    def scrap_teams(match, match_element)
      teams_elements = match_element.css('img')

      unless teams_elements.length == 2
        raise InvalidNumberOfEntitiesError.new(:team, teams_elements.length)
      end

      # Non-element nodes are filtered out *before* mapping, so the length
      # check below can actually detect a missing team (a `next` inside
      # `map` would only produce a nil placeholder and keep the length at 2).
      teams_name = teams_elements.select(&:element?).map do |team_element|
        scrap_team_names_helper(team_element)
      end

      unless teams_name.length == 2
        raise InvalidNumberOfEntitiesError.new(:team, teams_name.length)
      end

      match.team, match.opponent = teams_name
    end

    # Fills start_at when scrap_info did not find it, then lets the helper
    # combine date and start_at; date is only overwritten when the helper
    # returns a value.
    def scrap_start_at(match, match_element)
      match.start_at ||= find_start_at_helper(match_element)

      datetime = date_with_start_at_helper(match.date, match.start_at)
      match.date = datetime if datetime
    end
  end
end
|
@@ -0,0 +1,118 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class ScrapCbf
  # Builds Ranking entities from the ranking table found in the document.
  class RankingsBuilder
    extend Forwardable
    include RankingsHelper
    include Formattable
    include Printable

    delegate [:each] => :@rankings

    # @param document parsed document responding to #css
    # @param championship object responding to #year and #serie
    def initialize(document, championship)
      @championship = championship
      @rankings = []
      @header = []
      @rows = []

      tables = document.css('table')
      table = find_table_by_header(tables, Ranking::TABLE_HEADER)

      # No ranking table in the document: leave the builder empty.
      return unless table

      scrap_rankings(table)
    end

    # @return [Array] the result of calling #to_h on every ranking.
    def to_h
      @rankings.map(&:to_h)
    end

    private

    # Header must be scraped first: scrap_body validates each row against
    # the number of header columns.
    def scrap_rankings(table)
      scrap_header(table)
      scrap_body(table)
      create_rankings_from_table
    end

    # Collects one HeaderColumn (text + optional title) per <th>.
    def scrap_header(table)
      table.css('thead > tr > th').each do |th|
        text = th.element? && remove_whitespace(th)
        next unless text

        title = title_or_nil_helper(th)

        @header << HeaderColumn.new(text, title)
      end
    end

    # Collects one Row per visible <tr> of the table body.
    #
    # @raise [RowSizeError] when a row does not have header length + 1 cells.
    def scrap_body(table)
      table.css('tbody > tr').each do |tr_element|
        next if tr_element.element? && element_hidden?(tr_element)

        row = Row.new
        tr_element.children.each do |td_element|
          text = td_element.element? && remove_whitespace(td_element)

          next unless text

          text = scrap_position_if_exist(text)

          team = scrap_team_name_if_exist(td_element)

          # First cell (e.g posicao: 7º and team: Fluminense)
          if text && !text.empty? && team && !team.empty?
            row.cells << Cell.new(text)
            row.cells << Cell.new(team)
          elsif team && !team.empty?
            row.cells << Cell.new(team)
          else
            row.cells << Cell.new(text)
          end
        end

        # Add 1 to header length because on first cell we scrap 2 values
        row_length = row.cells.length
        header_length = @header.length + 1
        unless row_length == header_length
          raise RowSizeError.new(row_length, header_length)
        end

        @rows << row
      end
    end

    # Returns the position digits when the text starts with e.g. "7º",
    # otherwise returns the text unchanged (nil stays nil).
    def scrap_position_if_exist(text)
      position = text && text[/^(\d{1,2})º/i, 1]

      position || text
    end

    # Returns the team name from a title shaped like "Fluminense - RJ",
    # or nil when the element has no title in that shape.
    #
    # A single pattern is used both to validate and to extract, so a name
    # containing "-" is returned whole and a very short name cannot end in
    # a call to #strip on nil.
    def scrap_team_name_if_exist(element)
      title = title_or_nil_helper(element)
      return unless title

      name = title[/^([a-záàâãéèêíïóôõöúç\s\-]+) - [a-z]{2}$/i, 1]
      name&.strip
    end

    # Turns every collected row into a Ranking; the cell at index i is
    # assigned through the setter named by Ranking::ATTRS_RANK[i].
    def create_rankings_from_table
      attrs_rank = Ranking::ATTRS_RANK

      @rows.each do |row|
        ranking = Ranking.new
        ranking.championship = @championship.year
        ranking.serie = @championship.serie

        row.cells.each_with_index do |cell, idx|
          ranking.send "#{attrs_rank[idx]}=", cell.value
        end

        @rankings << ranking
      end
    end
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class ScrapCbf
  # Builds the Round entities (each holding its matches builder) from the
  # round slides of the document.
  class RoundsBuilder
    extend Forwardable
    include Formattable
    include Printable

    def_delegators :@rounds, :each

    # @param document parsed document responding to #css
    # @param championship object responding to #year and #serie
    def initialize(document, championship)
      @championship = championship
      @rounds = []

      scrap_rounds(document)
    end

    # Flattens the matches of every round, in round order, into a single
    # MatchesBuilder.
    #
    # @return [MatchesBuilder]
    def matches_builder
      every_match = @rounds.flat_map { |round| round.matches.all }

      MatchesBuilder.new(every_match)
    end

    # @return [Array] the result of calling #to_h on every round.
    def to_h
      @rounds.map(&:to_h)
    end

    private

    # Walks slide indexes 0 through 37 and creates one round for every
    # <div> child found under the slide with that index.
    def scrap_rounds(rounds_elements)
      38.times do |slide_index|
        slide = rounds_elements.css("div[data-slide-index=#{slide_index}]")

        slide.children.each do |child|
          next if !child.element? || child.name != 'div'

          @rounds << scrap_round(child, slide_index)
        end
      end
    end

    # Creates the Round for one slide and fills in its matches.
    def scrap_round(round_element, round_number)
      Round.new.tap do |round|
        round.championship = @championship.year
        round.serie = @championship.serie
        # Slide indexes are zero-based while round numbers start at 1.
        round.number = round_number + 1

        scrap_matches(round, round_element)
      end
    end

    # Matches are found inside a <ul> child of the round element; when
    # several <ul> children exist the last one wins.
    #
    # @return the given round
    def scrap_matches(round, round_element)
      round_element.children.each do |child|
        next if !child.element? || child.name != 'ul'

        round.matches =
          MatchesPerRoundBuilder.new(child, round.number, @championship)
      end

      round
    end
  end
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class ScrapCbf
  # Builds Team entities from the rows of the ranking table.
  class TeamsBuilder
    extend Forwardable
    include TeamsHelper
    include Formattable
    include Printable

    # Title shaped like "Santos - SP": capture 1 is the team name, the two
    # trailing letters are the state. Hyphens are accepted inside the name,
    # matching the pattern RankingsBuilder uses for the same titles.
    TITLE_PATTERN = /^([a-záàâãéèêíïóôõöúç\s\-]+) - [a-z]{2}$/i.freeze
    private_constant :TITLE_PATTERN

    delegate [:each] => :@teams

    # @param document parsed document responding to #css
    def initialize(document)
      @teams = []

      tables = document.css('table')
      table = find_table_by_header(tables, Ranking::TABLE_HEADER)

      # Same guard RankingsBuilder applies: without a ranking table there
      # is nothing to scrape, so stay empty instead of calling #css on nil.
      return unless table

      scrap_teams(table)
    end

    # @return [Array] the result of calling #to_h on every team.
    def to_h
      @teams.map(&:to_h)
    end

    private

    # Adds one Team per visible row of the table body.
    #
    # @raise [InvalidNumberOfEntitiesError] when a row has no <img> or more
    #   than two of them.
    def scrap_teams(table)
      table.css('tbody > tr').each do |tr_element|
        # Remove the rows that are invisible by default
        next if tr_element.element? && element_hidden?(tr_element)

        teams_elements = tr_element.css('img')

        # two teams are found in a row: team and next opponent
        # the last one may be not present
        unless teams_elements.length.between?(1, 2)
          raise InvalidNumberOfEntitiesError.new(:team, teams_elements.length)
        end

        # only the first team is scraped
        @teams << scrap_team(teams_elements.first)
      end
    end

    # Builds a Team from an <img> element. The team is returned with no
    # attributes set when the element has no title in the expected shape.
    def scrap_team(team_element)
      team = Team.new
      return team unless team_element&.key?('title')
      return team unless team_element['title'].match?(TITLE_PATTERN)

      scrap_name(team, team_element)
      scrap_state(team, team_element)
      scrap_avatar_url(team, team_element)

      team
    end

    def scrap_name(team, team_element)
      # e.g "Santos". Taken from the capture group of the validation
      # pattern so any name that passed validation can be extracted.
      team.name = team_element['title'][TITLE_PATTERN, 1]&.strip
    end

    def scrap_state(team, team_element)
      # e.g "SP"
      team.state = team_element['title'][/[a-z]{2}$/i]
    end

    def scrap_avatar_url(team, team_element)
      team.avatar_url = team_element['src'] if team_element.key?('src')
    end
  end
end
|