scrap_cbf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/scrap_cbf.rb +92 -0
- data/lib/scrap_cbf/builders/matches_builder.rb +21 -0
- data/lib/scrap_cbf/builders/matches_per_round_builder.rb +113 -0
- data/lib/scrap_cbf/builders/rankings_builder.rb +118 -0
- data/lib/scrap_cbf/builders/rounds_builder.rb +75 -0
- data/lib/scrap_cbf/builders/teams_builder.rb +78 -0
- data/lib/scrap_cbf/document.rb +93 -0
- data/lib/scrap_cbf/errors.rb +51 -0
- data/lib/scrap_cbf/formattable.rb +16 -0
- data/lib/scrap_cbf/helpers/lib/depth_search.rb +37 -0
- data/lib/scrap_cbf/helpers/lib/element_nokogiri.rb +17 -0
- data/lib/scrap_cbf/helpers/lib/findable.rb +33 -0
- data/lib/scrap_cbf/helpers/matches_helper.rb +136 -0
- data/lib/scrap_cbf/helpers/rankings_helper.rb +19 -0
- data/lib/scrap_cbf/helpers/teams_helper.rb +11 -0
- data/lib/scrap_cbf/models/championship.rb +26 -0
- data/lib/scrap_cbf/models/match.rb +38 -0
- data/lib/scrap_cbf/models/ranking.rb +45 -0
- data/lib/scrap_cbf/models/round.rb +33 -0
- data/lib/scrap_cbf/models/table/cell.rb +10 -0
- data/lib/scrap_cbf/models/table/header_column.rb +11 -0
- data/lib/scrap_cbf/models/table/row.rb +10 -0
- data/lib/scrap_cbf/models/team.rb +19 -0
- data/lib/scrap_cbf/printable.rb +19 -0
- data/lib/scrap_cbf/samples/cbf_serie_a_2020.html +17330 -0
- data/lib/scrap_cbf/version.rb +5 -0
- metadata +193 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 468506451d0e8869e39a079ddaacf33a8aacc1e2a79052cdb251611217874c0d
|
4
|
+
data.tar.gz: 05a8c5cc7e71b8323ee4e3bd73ae723519a24ffe357e392910c044186496b586
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ff2bad7d68f58a68d8499c34f175bac2231616f41c99190fa91f80e0a8ed1ecf59fe7717a81bc3a86cefe73fdefaa52028248a6f5f7456a4f40ad85ef1e468b4
|
7
|
+
data.tar.gz: 01bc568cdca4a5ccd530bdafd42706b45724f5cd1f6bb01db23182ea8f9ea74eba4e031f7c0603fcefadfba900b6643616486e8499c354c13ba578760096da83
|
data/lib/scrap_cbf.rb
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'open-uri'
|
5
|
+
require 'json'
|
6
|
+
require 'active_support/core_ext/hash/indifferent_access'
|
7
|
+
require 'forwardable'
|
8
|
+
|
9
|
+
require_relative 'scrap_cbf/errors'
|
10
|
+
require_relative 'scrap_cbf/formattable'
|
11
|
+
require_relative 'scrap_cbf/printable'
|
12
|
+
require_relative 'scrap_cbf/document'
|
13
|
+
require_relative 'scrap_cbf/models/table/header_column'
|
14
|
+
require_relative 'scrap_cbf/models/table/row'
|
15
|
+
require_relative 'scrap_cbf/models/table/cell'
|
16
|
+
require_relative 'scrap_cbf/models/championship'
|
17
|
+
require_relative 'scrap_cbf/models/team'
|
18
|
+
require_relative 'scrap_cbf/models/round'
|
19
|
+
require_relative 'scrap_cbf/models/match'
|
20
|
+
require_relative 'scrap_cbf/models/ranking'
|
21
|
+
require_relative 'scrap_cbf/helpers/matches_helper'
|
22
|
+
require_relative 'scrap_cbf/helpers/rankings_helper'
|
23
|
+
require_relative 'scrap_cbf/helpers/teams_helper'
|
24
|
+
require_relative 'scrap_cbf/builders/matches_builder'
|
25
|
+
require_relative 'scrap_cbf/builders/matches_per_round_builder'
|
26
|
+
require_relative 'scrap_cbf/builders/rankings_builder'
|
27
|
+
require_relative 'scrap_cbf/builders/rounds_builder'
|
28
|
+
require_relative 'scrap_cbf/builders/teams_builder'
|
29
|
+
|
30
|
+
# ScrapCbf is a gem created for scraping data from the CBF official page.
|
31
|
+
# Some of the data found on the CBF page are:
|
32
|
+
# teams, matches, rounds and ranking table from all championships found
|
33
|
+
# on the official page.
|
34
|
+
class ScrapCbf
  include Formattable
  include Printable

  # @!attribute [r] document
  #   @return [ScrapCbf::Document] the Document built in {#initialize}.
  # @!attribute [r] championship
  #   @return [ScrapCbf::Championship] the Championship built in {#initialize}.
  attr_reader :document, :championship

  # Builds the document and the championship for the requested year and serie.
  # The parsed document is kept so the builders below can be created lazily.
  #
  # @param [Hash] opts
  # @option opts [Integer] :year The Championship year (defaults to the
  #   current year).
  # @option opts [Symbol] :serie The Championship serie (defaults to
  #   +:serie_a+).
  # @option opts [Symbol] :load_from_sample Load championship from sample.
  # @option opts [Symbol] :sample_path to the sample otherwise default
  #
  # The whole +opts+ hash is forwarded to Document.new, which is where the
  # sample-related options are consumed.
  #
  # @return [ScrapCbf] new instance
  def initialize(opts = {})
    current_year = Date.today.year.to_i
    year = opts.fetch(:year, current_year)
    serie = opts.fetch(:serie, :serie_a)

    @document = Document.new(year, serie, opts)
    @parsed_document = @document.parsed_document
    @championship = Championship.new(year, serie)
  end

  # Serializes every scraped entity by calling +to_h+ on the championship and
  # on each builder.
  #
  # @return [ActiveSupport::HashWithIndifferentAccess] keys: championship,
  #   matches, rankings, rounds and teams.
  def to_h
    entities = {}
    entities[:championship] = championship.to_h
    entities[:matches] = matches.to_h
    entities[:rankings] = rankings.to_h
    entities[:rounds] = rounds.to_h
    entities[:teams] = teams.to_h

    entities.with_indifferent_access
  end

  # Memoized. Derived from {#rounds}, so the rounds are scraped first.
  #
  # @return [MatchesBuilder] instance.
  def matches
    @matches ||= rounds.matches_builder
  end

  # Memoized.
  #
  # @return [RankingsBuilder] instance.
  def rankings
    @rankings ||= RankingsBuilder.new(@parsed_document, @championship)
  end

  # Memoized.
  #
  # @return [RoundsBuilder] instance.
  def rounds
    @rounds ||= RoundsBuilder.new(@parsed_document, @championship)
  end

  # Memoized.
  #
  # @return [TeamsBuilder] instance.
  def teams
    @teams ||= TeamsBuilder.new(@parsed_document)
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class ScrapCbf
  # Thin collection wrapper around an already-built list of matches.
  class MatchesBuilder
    extend Forwardable
    include Formattable
    include Printable

    # Iteration is forwarded straight to the wrapped collection.
    def_delegators :@matches, :each, :map

    attr_accessor :matches

    # @param matches [#each, #map] collection of matches to wrap.
    def initialize(matches)
      @matches = matches
    end

    # @return [Array] the result of calling +to_h+ on every wrapped match.
    def to_h
      @matches.map { |match| match.to_h }
    end
  end
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class ScrapCbf
  # Builds one Match per element child of a round's match list and keeps them
  # in +matches+ (also available as +all+).
  class MatchesPerRoundBuilder
    extend Forwardable
    include MatchesHelper
    include Formattable
    include Printable

    delegate %i[each map] => :@matches

    attr_reader :matches
    alias all matches

    # @param matches_elements [#children] node whose element children each
    #   describe one match.
    # @param round_number [Integer] value stored as +round+ on every match.
    # @param championship [#year, #serie] source of the year and serie stored
    #   on every match.
    def initialize(matches_elements, round_number, championship)
      @championship = championship
      @matches = []

      scrap_matches(matches_elements, round_number)
    end

    # @return [Array] the result of calling +to_h+ on every match.
    def to_h
      @matches.map(&:to_h)
    end

    private

    # Appends one scraped match per element child; non-element children
    # (e.g. text nodes) are skipped.
    def scrap_matches(matches_elements, round_number)
      matches_elements.children.each do |match_element|
        next unless match_element.element?

        @matches << scrap_match(match_element, round_number)
      end
    end

    # Creates a Match and fills it field by field from +match_element+.
    #
    # @return [Match]
    def scrap_match(match_element, round_number)
      match = Match.new
      match.championship = @championship.year
      match.serie = @championship.serie

      match.round = round_number

      # e.g "Qua, 03/02/2021 16:00 - Jogo: 336"
      scrap_info(match, match_element)
      # e.g <img title="team-name">
      scrap_teams(match, match_element)
      # e.g "1 alteração" (can be undefined)
      scrap_update(match, match_element)
      # e.g "16:00" (can be found in two places, we take only the first)
      scrap_start_at(match, match_element)
      # e.g "1 x 1" (can be undefined)
      scrap_score(match, match_element)
      # e.g "Vila Belmiro - Santos - SP" (can be undefined)
      scrap_place(match, match_element)

      match
    end

    # Extracts match id, date and start time from the info text. Each field
    # is set to nil when its pattern is not present in the text.
    def scrap_info(match, match_element)
      info = find_info_helper(match_element)
      return unless info

      # e.g "Jogo: 336" -> "336". The capture group returns only the digits,
      # so the prefix is dropped whatever its letter case (the pattern is
      # case-insensitive) and a missing marker yields nil instead of raising
      # NoMethodError on nil.
      match.id_match = info[/Jogo: (\d{1,3})$/i, 1]
      # e.g "03/02/2021" (can be undefined)
      match.date = info[%r{\d{2}/\d{2}/\d{2,4}}i]
      # e.g "16:00" (can be undefined)
      match.start_at = info[/\d{2}:\d{2}/i]
    end

    def scrap_place(match, match_element)
      match.place = find_place_helper(match_element)
    end

    # Splits the score found by the helper into the two per-team scores.
    def scrap_score(match, match_element)
      score = find_score_helper(match_element)

      match.team_score = score_by_team_helper(:team, score)
      match.opponent_score = score_by_team_helper(:opponent, score)
    end

    def scrap_update(match, match_element)
      match.updates = find_updates_helper(match_element)
    end

    # Reads the two team names from the <img> elements of the match.
    #
    # @raise [InvalidNumberOfEntitiesError] when the match element does not
    #   contain exactly two <img> elements.
    def scrap_teams(match, match_element)
      teams_elements = match_element.css('img')

      unless teams_elements.length == 2
        raise InvalidNumberOfEntitiesError.new(:team, teams_elements.length)
      end

      teams_name = teams_elements.map do |team_element|
        next unless team_element.element?

        scrap_team_names_helper(team_element)
      end

      # NOTE(review): +map+ keeps one entry (possibly nil) per element, so
      # this length always equals the one validated above. Kept as a guard;
      # confirm whether nil names should be rejected here.
      unless teams_name.length == 2
        raise InvalidNumberOfEntitiesError.new(:team, teams_name.length)
      end

      match.team = teams_name[0]
      match.opponent = teams_name[1]
    end

    # Falls back to the helper only when scrap_info found no start time, then
    # replaces the date with the combined value when the helper returns one.
    def scrap_start_at(match, match_element)
      match.start_at ||= find_start_at_helper(match_element)
      datetime = date_with_start_at_helper(match.date, match.start_at)
      match.date = datetime if datetime
    end
  end
end
|
@@ -0,0 +1,118 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class ScrapCbf
  # Scrapes the ranking table of the document into Ranking entities.
  #
  # The table is first read into header columns and rows of cells, and the
  # rows are then mapped onto Ranking attributes by position.
  class RankingsBuilder
    extend Forwardable
    include RankingsHelper
    include Formattable
    include Printable

    def_delegators :@rankings, :each

    # @param document [#css] parsed document searched for <table> elements.
    # @param championship [#year, #serie] source of the year and serie stored
    #   on every ranking.
    def initialize(document, championship)
      @championship = championship
      @rankings = []
      @header = []
      @rows = []

      candidates = document.css('table')
      table = find_table_by_header(candidates, Ranking::TABLE_HEADER)

      # Without a matching table the builder simply stays empty.
      scrap_rankings(table) if table
    end

    # @return [Array] the result of calling +to_h+ on every ranking.
    def to_h
      @rankings.map { |ranking| ranking.to_h }
    end

    private

    # Header must be read before the body: row sizes are validated against it.
    def scrap_rankings(table)
      scrap_header(table)
      scrap_body(table)
      create_rankings_from_table
    end

    # Stores one HeaderColumn (text plus optional title) per <th>.
    def scrap_header(table)
      table.css('thead > tr > th').each do |th|
        next unless th.element?

        text = remove_whitespace(th)
        next unless text

        @header << HeaderColumn.new(text, title_or_nil_helper(th))
      end
    end

    # Stores one Row per visible <tr> of the table body.
    #
    # @raise [RowSizeError] when a row does not hold header length + 1 cells.
    def scrap_body(table)
      # Add 1 to header length because on first cell we scrap 2 values
      expected_length = @header.length + 1

      table.css('tbody > tr').each do |tr_element|
        next if tr_element.element? && element_hidden?(tr_element)

        row = Row.new
        tr_element.children.each do |td_element|
          next unless td_element.element?

          raw_text = remove_whitespace(td_element)
          next unless raw_text

          text = scrap_position_if_exist(raw_text)
          team = scrap_team_name_if_exist(td_element)

          text_present = text && !text.empty?
          team_present = team && !team.empty?

          if team_present
            # First cell (e.g posicao: 7º and team: Fluminense) yields two
            # values; a team without text yields only the team.
            row.cells << Cell.new(text) if text_present
            row.cells << Cell.new(team)
          else
            row.cells << Cell.new(text)
          end
        end

        actual_length = row.cells.length
        if actual_length != expected_length
          raise RowSizeError.new(actual_length, expected_length)
        end

        @rows << row
      end
    end

    # Returns only the digits when +text+ starts with an ordinal such as
    # "7º"; otherwise returns +text+ unchanged.
    def scrap_position_if_exist(text)
      ordinal = text&.slice(/^\d{1,2}º/i)
      return text unless ordinal

      ordinal.strip.delete('º')
    end

    # Returns the team name when the element title looks like "Name - UF",
    # nil otherwise.
    def scrap_team_name_if_exist(element)
      title = title_or_nil_helper(element)
      team_title = title&.match?(/^[a-záàâãéèêíïóôõöúç\s\-]+ - [a-z]{2}$/i)
      return unless team_title

      name = title[/^[a-záàâãéèêíïóôõöúç\s]{3,50}/i]
      name.strip
    end

    # Turns every stored row into a Ranking, assigning the cell at position
    # N through the setter named by Ranking::ATTRS_RANK[N].
    def create_rankings_from_table
      @rows.each do |row|
        ranking = Ranking.new
        ranking.championship = @championship.year
        ranking.serie = @championship.serie

        attribute_names = Ranking::ATTRS_RANK

        row.cells.each_with_index do |cell, position|
          setter = "#{attribute_names[position]}="
          ranking.send(setter, cell.value)
        end

        @rankings << ranking
      end
    end
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class ScrapCbf
  # Scrapes the rounds of the document, each with its matches.
  class RoundsBuilder
    extend Forwardable
    include Formattable
    include Printable

    def_delegators :@rounds, :each

    # @param document [#css] parsed document searched for round slides.
    # @param championship [#year, #serie] source of the year and serie stored
    #   on every round and handed to the matches builder.
    def initialize(document, championship)
      @championship = championship
      @rounds = []

      scrap_rounds(document)
    end

    # Gathers the matches of every round, in round order, into one builder.
    #
    # @return [MatchesBuilder]
    def matches_builder
      every_match = @rounds.flat_map { |round| round.matches.all }

      MatchesBuilder.new(every_match)
    end

    # @return [Array] the result of calling +to_h+ on every round.
    def to_h
      @rounds.map { |round| round.to_h }
    end

    private

    # Looks up the slides with data-slide-index 0 through 37 and scrapes one
    # round per <div> child found in each of them.
    def scrap_rounds(rounds_elements)
      0.upto(37) do |round_number|
        selector = "div[data-slide-index=#{round_number}]"
        slide = rounds_elements.css(selector)

        slide.children.each do |element|
          next unless element.element? && element.name == 'div'

          @rounds << scrap_round(element, round_number)
        end
      end
    end

    # @return [Round] round filled with championship data, number and matches.
    def scrap_round(round_element, round_number)
      Round.new.tap do |round|
        round.championship = @championship.year
        round.serie = @championship.serie

        # Because index starts on zero, we add 1 for matching with Rounds ID
        round.number = round_number + 1
        scrap_matches(round, round_element)
      end
    end

    # Assigns the round's matches from its <ul> children; when there is more
    # than one <ul>, the last one wins.
    def scrap_matches(round, round_element)
      # matches are found on <ul>
      match_lists = round_element.children.select do |element|
        element.element? && element.name == 'ul'
      end

      match_lists.each do |match_list|
        round.matches = MatchesPerRoundBuilder.new(
          match_list,
          round.number,
          @championship
        )
      end

      round
    end
  end
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class ScrapCbf
  # Scrapes the teams listed in the ranking table of the document.
  class TeamsBuilder
    extend Forwardable
    include TeamsHelper
    include Formattable
    include Printable

    delegate [:each] => :@teams

    # @param document [#css] parsed document searched for <table> elements.
    # @raise [InvalidNumberOfEntitiesError] when a visible table row does not
    #   contain one or two <img> elements.
    def initialize(document)
      @teams = []

      tables = document.css('table')
      table = find_table_by_header(
        tables,
        Ranking::TABLE_HEADER
      )

      # When no table with the ranking header is found, leave the builder
      # empty instead of failing with NoMethodError on nil.
      return unless table

      scrap_teams(table)
    end

    # @return [Array] the result of calling +to_h+ on every team.
    def to_h
      @teams.map(&:to_h)
    end

    private

    # Appends one Team per visible row of the table body.
    def scrap_teams(table)
      table.css('tbody > tr').each do |tr_element|
        # Skip the rows reported as hidden
        next if tr_element.element? && element_hidden?(tr_element)

        teams_elements = tr_element.css('img')

        # two teams are found in a row: team and next opponent
        # the last one may be not present
        unless teams_elements.length.between?(1, 2)
          raise InvalidNumberOfEntitiesError.new(:team, teams_elements.length)
        end

        # only the first team is scraped
        team_element = teams_elements.first

        @teams << scrap_team(team_element)
      end
    end

    # Builds a Team from an <img> element. The attributes are filled only
    # when the title looks like "Name - UF"; otherwise the Team is returned
    # with nothing assigned.
    #
    # @return [Team]
    def scrap_team(team_element)
      team = Team.new

      if team_element&.key?('title') &&
         team_element['title'].match?(/^[a-záàâãéèêíïóôõöúç\s]+ - [a-z]{2}$/i)

        scrap_name(team, team_element)
        scrap_state(team, team_element)
        scrap_avatar_url(team, team_element)
      end

      team
    end

    def scrap_name(team, team_element)
      # e.g "Santos"
      team.name = team_element['title'][/^[a-záàâãéèêíïóôõöúç\s]{3,50}/i].strip
    end

    def scrap_state(team, team_element)
      # e.g "SP"
      team.state = team_element['title'][/[a-z]{2}$/i]
    end

    # The avatar is optional: it is set only when the element has a src.
    def scrap_avatar_url(team, team_element)
      team.avatar_url = team_element['src'] if team_element.key?('src')
    end
  end
end
|