scrap_cbf 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 468506451d0e8869e39a079ddaacf33a8aacc1e2a79052cdb251611217874c0d
4
+ data.tar.gz: 05a8c5cc7e71b8323ee4e3bd73ae723519a24ffe357e392910c044186496b586
5
+ SHA512:
6
+ metadata.gz: ff2bad7d68f58a68d8499c34f175bac2231616f41c99190fa91f80e0a8ed1ecf59fe7717a81bc3a86cefe73fdefaa52028248a6f5f7456a4f40ad85ef1e468b4
7
+ data.tar.gz: 01bc568cdca4a5ccd530bdafd42706b45724f5cd1f6bb01db23182ea8f9ea74eba4e031f7c0603fcefadfba900b6643616486e8499c354c13ba578760096da83
data/lib/scrap_cbf.rb ADDED
@@ -0,0 +1,92 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'nokogiri'
4
+ require 'open-uri'
5
+ require 'json'
6
+ require 'active_support/core_ext/hash/indifferent_access'
7
+ require 'forwardable'
8
+
9
+ require_relative 'scrap_cbf/errors'
10
+ require_relative 'scrap_cbf/formattable'
11
+ require_relative 'scrap_cbf/printable'
12
+ require_relative 'scrap_cbf/document'
13
+ require_relative 'scrap_cbf/models/table/header_column'
14
+ require_relative 'scrap_cbf/models/table/row'
15
+ require_relative 'scrap_cbf/models/table/cell'
16
+ require_relative 'scrap_cbf/models/championship'
17
+ require_relative 'scrap_cbf/models/team'
18
+ require_relative 'scrap_cbf/models/round'
19
+ require_relative 'scrap_cbf/models/match'
20
+ require_relative 'scrap_cbf/models/ranking'
21
+ require_relative 'scrap_cbf/helpers/matches_helper'
22
+ require_relative 'scrap_cbf/helpers/rankings_helper'
23
+ require_relative 'scrap_cbf/helpers/teams_helper'
24
+ require_relative 'scrap_cbf/builders/matches_builder'
25
+ require_relative 'scrap_cbf/builders/matches_per_round_builder'
26
+ require_relative 'scrap_cbf/builders/rankings_builder'
27
+ require_relative 'scrap_cbf/builders/rounds_builder'
28
+ require_relative 'scrap_cbf/builders/teams_builder'
29
+
30
# ScrapCbf is a gem created for scraping data from the CBF official page.
# Some of the data found on the CBF page are:
# teams, matches, rounds and ranking table from all championships found
# on the official page.
class ScrapCbf
  include Formattable
  include Printable

  # @!attribute [r] document
  #   @return [ScrapCbf::Document] ScrapCbf::Document instance.
  # @!attribute [r] championship
  #   @return [ScrapCbf::Championship] ScrapCbf::Championship instance.
  attr_reader :document
  attr_reader :championship

  # Creates the Document and Championship for the requested year/serie and
  # keeps the parsed document for the builders that are created on demand.
  #
  # @param [Hash] opts
  # @option opts [Integer] :year The Championship year
  #   (defaults to the current year).
  # @option opts [Symbol] :serie The Championship serie
  #   (defaults to :serie_a).
  # @option opts [Symbol] :load_from_sample Load championship from sample.
  # @option opts [Symbol] :sample_path to the sample otherwise default
  #
  # @return [ScrapCbf] new instance
  def initialize(opts = {})
    # Time is a core class, so the default does not rely on the `date`
    # library having been loaded by some other require. The block form
    # only computes the default when :year was not given.
    year = opts.fetch(:year) { Time.now.year }
    serie = opts.fetch(:serie, :serie_a)

    # The whole opts hash is handed to Document alongside year and serie.
    @document = Document.new(year, serie, opts)
    @parsed_document = @document.parsed_document
    @championship = Championship.new(year, serie)
  end

  # Returns all entities scraped in hash format.
  #
  # @return [ActiveSupport::HashWithIndifferentAccess] keys:
  #   championship, matches, rankings, rounds and teams.
  def to_h
    {
      championship: championship.to_h,
      matches: matches.to_h,
      rankings: rankings.to_h,
      rounds: rounds.to_h,
      teams: teams.to_h
    }.with_indifferent_access
  end

  # Built from the rounds builder, so rounds are scraped first.
  #
  # @return [MatchesBuilder] instance (memoized).
  def matches
    @matches ||= rounds.matches_builder
  end

  # @return [RankingsBuilder] instance (memoized).
  def rankings
    @rankings ||= RankingsBuilder.new(@parsed_document, @championship)
  end

  # @return [RoundsBuilder] instance (memoized).
  def rounds
    @rounds ||= RoundsBuilder.new(@parsed_document, @championship)
  end

  # @return [TeamsBuilder] instance (memoized).
  def teams
    @teams ||= TeamsBuilder.new(@parsed_document)
  end
end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
class ScrapCbf
  # Collection wrapper around a list of matches that was already built
  # elsewhere (RoundsBuilder#matches_builder hands it the matches of
  # every round).
  class MatchesBuilder
    extend Forwardable
    include Formattable
    include Printable

    # Iteration is forwarded straight to the wrapped list.
    def_delegators :@matches, :each, :map

    attr_accessor :matches

    # @param matches [Array] the match objects to wrap.
    def initialize(matches)
      @matches = matches
    end

    # @return [Array] the result of calling to_h on every wrapped match.
    def to_h
      @matches.map { |match| match.to_h }
    end
  end
end
@@ -0,0 +1,113 @@
1
+ # frozen_string_literal: true
2
+
3
class ScrapCbf
  # Scrapes every match found under one round element and exposes the
  # resulting Match objects as a collection.
  #
  # The *_helper methods used below are not defined in this class;
  # presumably they come from MatchesHelper.
  class MatchesPerRoundBuilder
    extend Forwardable
    include MatchesHelper
    include Formattable
    include Printable

    delegate %i[each map] => :@matches

    attr_reader :matches
    alias all matches

    # @param matches_elements [#children] node whose element children each
    #   describe one match (RoundsBuilder passes a <ul> element).
    # @param round_number [Integer] number stored on every scraped match.
    # @param championship [#year, #serie] championship the matches belong to.
    def initialize(matches_elements, round_number, championship)
      @championship = championship
      @matches = []

      scrap_matches(matches_elements, round_number)
    end

    # @return [Array] the result of calling to_h on every scraped match.
    def to_h
      @matches.map(&:to_h)
    end

    private

    # Scrapes one Match per element child; non-element nodes are skipped.
    def scrap_matches(matches_elements, round_number)
      matches_elements.children.each do |match_element|
        next unless match_element.element?

        @matches << scrap_match(match_element, round_number)
      end
    end

    # Builds a single Match. The order matters: scrap_start_at relies on
    # the date/start_at values that scrap_info may already have set.
    #
    # @return [Match]
    def scrap_match(match_element, round_number)
      match = Match.new
      match.championship = @championship.year
      match.serie = @championship.serie

      match.round = round_number

      # e.g "Qua, 03/02/2021 16:00 - Jogo: 336"
      scrap_info(match, match_element)
      # e.g <img title="team-name">
      scrap_teams(match, match_element)
      # e.g "1 alteração" (can be undefined)
      scrap_update(match, match_element)
      # e.g "16:00" (can be found in two places, we take only the first)
      scrap_start_at(match, match_element)
      # e.g "1 x 1" (can be undefined)
      scrap_score(match, match_element)
      # e.g "Vila Belmiro - Santos - SP" (can be undefined)
      scrap_place(match, match_element)

      match
    end

    # Reads the match id, date and start time out of the info text.
    # Nothing is set when no info text is found.
    def scrap_info(match, match_element)
      info = find_info_helper(match_element)
      return unless info

      # e.g "Jogo: 336" -> "336". The capture group yields nil instead of
      # raising when the id is missing, and strips the label with the same
      # case-insensitive pattern that located it.
      match.id_match = info[/Jogo: (\d{1,3})$/i, 1]
      # e.g "03/02/2021" (can be undefined)
      match.date = info[%r{\d{2}/\d{2}/\d{2,4}}i]
      # e.g "16:00" (can be undefined)
      match.start_at = info[/\d{2}:\d{2}/i]
    end

    def scrap_place(match, match_element)
      match.place = find_place_helper(match_element)
    end

    # Splits the scraped score between the team and its opponent.
    def scrap_score(match, match_element)
      score = find_score_helper(match_element)

      match.team_score = score_by_team_helper(:team, score)
      match.opponent_score = score_by_team_helper(:opponent, score)
    end

    def scrap_update(match, match_element)
      match.updates = find_updates_helper(match_element)
    end

    # Reads the two team names from the <img> tags of the match element:
    # the first is the team, the second the opponent.
    #
    # @raise [InvalidNumberOfEntitiesError] unless exactly two team
    #   elements are found.
    def scrap_teams(match, match_element)
      teams_elements = match_element.css('img')

      unless teams_elements.length == 2
        raise InvalidNumberOfEntitiesError.new(:team, teams_elements.length)
      end

      # Non-element nodes are dropped before mapping, so the length check
      # below can actually fail (nil placeholders used to keep it at 2).
      teams_name = teams_elements.select(&:element?).map do |team_element|
        scrap_team_names_helper(team_element)
      end

      unless teams_name.length == 2
        raise InvalidNumberOfEntitiesError.new(:team, teams_name.length)
      end

      match.team = teams_name[0]
      match.opponent = teams_name[1]
    end

    # Uses the start time found by scrap_info when there is one, otherwise
    # looks it up on the element; then replaces the date with the combined
    # value when the helper returns one.
    def scrap_start_at(match, match_element)
      match.start_at = find_start_at_helper(match_element) unless match.start_at
      datetime = date_with_start_at_helper(match.date, match.start_at)
      match.date = datetime if datetime
    end
  end
end
@@ -0,0 +1,118 @@
1
+ # frozen_string_literal: true
2
+
3
class ScrapCbf
  # Scrapes the ranking table of a championship page into Ranking objects.
  #
  # find_table_by_header, remove_whitespace, element_hidden? and
  # title_or_nil_helper are not defined in this class; presumably they
  # come from RankingsHelper.
  class RankingsBuilder
    extend Forwardable
    include RankingsHelper
    include Formattable
    include Printable

    delegate [:each] => :@rankings

    # @param document [#css] parsed page that is searched for tables.
    # @param championship [#year, #serie] championship the rankings
    #   belong to.
    def initialize(document, championship)
      @championship = championship
      @rankings = []
      @header = []
      @rows = []

      tables = document.css('table')
      table = find_table_by_header(tables, Ranking::TABLE_HEADER)

      # No matching table: the collection simply stays empty.
      return unless table

      scrap_rankings(table)
    end

    # @return [Array] the result of calling to_h on every ranking.
    def to_h
      @rankings.map(&:to_h)
    end

    private

    # Header first: scrap_body validates each row against its length.
    def scrap_rankings(table)
      scrap_header(table)
      scrap_body(table)
      create_rankings_from_table
    end

    # Collects one HeaderColumn (text and title) per <th> of the table head.
    def scrap_header(table)
      table.css('thead > tr > th').each do |th|
        text = th.element? && remove_whitespace(th)
        next unless text

        title = title_or_nil_helper(th)

        @header << HeaderColumn.new(text, title)
      end
    end

    # Collects one Row per visible <tr> of the table body.
    #
    # @raise [RowSizeError] when a row does not have one cell more than
    #   the header has columns.
    def scrap_body(table)
      table.css('tbody > tr').each do |tr_element|
        next if tr_element.element? && element_hidden?(tr_element)

        row = scrap_row(tr_element)

        # Add 1 to header length because on first cell we scrap 2 values
        row_length = row.cells.length
        header_length = @header.length + 1
        unless row_length == header_length
          raise RowSizeError.new(row_length, header_length)
        end

        @rows << row
      end
    end

    # Builds the Row for one <tr>, skipping non-element children and
    # children without text.
    def scrap_row(tr_element)
      row = Row.new

      tr_element.children.each do |td_element|
        text = td_element.element? && remove_whitespace(td_element)
        next unless text

        append_cells(row, text, td_element)
      end

      row
    end

    # Appends the cell(s) scraped from one <td> to the row.
    def append_cells(row, text, td_element)
      text = scrap_position_if_exist(text)
      team = scrap_team_name_if_exist(td_element)
      team_present = team && !team.empty?

      # First cell (e.g posicao: 7º and team: Fluminense)
      if team_present && text && !text.empty?
        row.cells << Cell.new(text)
        row.cells << Cell.new(team)
      elsif team_present
        row.cells << Cell.new(team)
      else
        row.cells << Cell.new(text)
      end
    end

    # Returns only the position digits when the text starts with a
    # position (e.g "7º ..." -> "7"); otherwise returns the text untouched.
    def scrap_position_if_exist(text)
      position = text && text[/^\d{1,2}º/i]
      return text unless position

      position.strip.delete('º')
    end

    # Returns the team name taken from the element's title
    # (e.g "Fluminense - RJ" -> "Fluminense"), or nil when the title is
    # missing or does not look like "<name> - <state>".
    def scrap_team_name_if_exist(element)
      title = title_or_nil_helper(element)

      return unless title&.match?(/^[a-záàâãéèêíïóôõöúç\s\-]+ - [a-z]{2}$/i)

      # The validation above accepts hyphens and short names that this
      # extraction pattern does not, so it can come back nil; treat that
      # as "no team name" instead of raising on nil.strip.
      title[/^[a-záàâãéèêíïóôõöúç\s]{3,50}/i]&.strip
    end

    # Turns every scraped row into a Ranking, assigning the cell values to
    # the attributes listed in Ranking::ATTRS_RANK by position.
    def create_rankings_from_table
      attrs_rank = Ranking::ATTRS_RANK

      @rows.each do |row|
        ranking = Ranking.new
        ranking.championship = @championship.year
        ranking.serie = @championship.serie

        row.cells.each_with_index do |cell, idx|
          ranking.send "#{attrs_rank[idx]}=", cell.value
        end

        @rankings << ranking
      end
    end
  end
end
@@ -0,0 +1,75 @@
1
+ # frozen_string_literal: true
2
+
3
class ScrapCbf
  # Scrapes the rounds of a championship page, each with its matches.
  class RoundsBuilder
    extend Forwardable
    include Formattable
    include Printable

    # Zero-based slide indexes looked up on the page (38 rounds).
    ROUND_INDEXES = (0..37).freeze

    delegate [:each] => :@rounds

    # @param document [#css] parsed page that is searched for round slides.
    # @param championship [#year, #serie] championship the rounds belong to.
    def initialize(document, championship)
      @championship = championship
      @rounds = []

      scrap_rounds(document)
    end

    # Gathers the matches of every round into a single MatchesBuilder.
    # A round without a matches builder contributes no matches instead of
    # raising on nil.
    #
    # @return [MatchesBuilder]
    def matches_builder
      matches = @rounds.flat_map { |round| Array(round.matches&.all) }

      MatchesBuilder.new(matches)
    end

    # @return [Array] the result of calling to_h on every round.
    def to_h
      @rounds.map(&:to_h)
    end

    private

    # Looks up the slide of each round index and scrapes one Round per
    # <div> child found in it.
    def scrap_rounds(rounds_elements)
      ROUND_INDEXES.each do |round_number|
        round_element = rounds_elements.css(
          "div[data-slide-index=#{round_number}]"
        )

        round_element.children.each do |element|
          next unless element.element? && element.name == 'div'

          @rounds << scrap_round(element, round_number)
        end
      end
    end

    # Builds the Round for one slide.
    #
    # @param round_number [Integer] zero-based slide index.
    # @return [Round]
    def scrap_round(round_element, round_number)
      round = Round.new
      round.championship = @championship.year
      round.serie = @championship.serie

      # Because index starts on zero, we add 1 for matching with Rounds ID
      round.number = round_number + 1
      scrap_matches(round, round_element)

      round
    end

    # Assigns a MatchesPerRoundBuilder to the round for each <ul> child;
    # when there are several, the last one wins.
    def scrap_matches(round, round_element)
      round_element.children.each do |element|
        # matches are found on <ul>
        next unless element.element? && element.name == 'ul'

        round.matches = MatchesPerRoundBuilder.new(
          element,
          round.number,
          @championship
        )
      end
      round
    end
  end
end
@@ -0,0 +1,78 @@
1
+ # frozen_string_literal: true
2
+
3
class ScrapCbf
  # Scrapes the teams of a championship from the rows of its ranking table.
  #
  # find_table_by_header and element_hidden? are not defined in this
  # class; presumably they come from TeamsHelper.
  class TeamsBuilder
    extend Forwardable
    include TeamsHelper
    include Formattable
    include Printable

    delegate [:each] => :@teams

    # @param document [#css] parsed page that is searched for tables.
    def initialize(document)
      @teams = []

      tables = document.css('table')
      table = find_table_by_header(
        tables,
        Ranking::TABLE_HEADER
      )

      # Same guard RankingsBuilder uses for this lookup: without a
      # matching table the collection simply stays empty.
      return unless table

      scrap_teams(table)
    end

    # @return [Array] the result of calling to_h on every team.
    def to_h
      @teams.map(&:to_h)
    end

    private

    # Scrapes one Team per visible row of the table body.
    #
    # @raise [InvalidNumberOfEntitiesError] when a row has no <img> or
    #   more than two of them.
    def scrap_teams(table)
      table.css('tbody > tr').each do |tr_element|
        # Remove the rows that are invisible by default
        next if tr_element.element? && element_hidden?(tr_element)

        teams_elements = tr_element.css('img')

        # two teams are found in a row: team and next opponent
        # the last one may be not present
        unless teams_elements.length.between?(1, 2)
          raise InvalidNumberOfEntitiesError.new(:team, teams_elements.length)
        end

        # only the first team is scraped
        @teams << scrap_team(teams_elements.first)
      end
    end

    # Builds a Team from an <img> element. The attributes are only filled
    # in when the title looks like "<name> - <state>"; otherwise a blank
    # Team is returned.
    #
    # @return [Team]
    def scrap_team(team_element)
      team = Team.new

      if team_element&.key?('title') &&
         team_element['title'].match?(/^[a-záàâãéèêíïóôõöúç\s]+ - [a-z]{2}$/i)

        scrap_name(team, team_element)
        scrap_state(team, team_element)
        scrap_avatar_url(team, team_element)
      end

      team
    end

    def scrap_name(team, team_element)
      # e.g "Santos"
      # The title check in scrap_team accepts names shorter than this
      # pattern requires, so the lookup may be nil; leave the name nil
      # in that case instead of raising on nil.strip.
      team.name = team_element['title'][/^[a-záàâãéèêíïóôõöúç\s]{3,50}/i]&.strip
    end

    def scrap_state(team, team_element)
      # e.g "SP"
      team.state = team_element['title'][/[a-z]{2}$/i]
    end

    # The avatar is the image source, when the element has one.
    def scrap_avatar_url(team, team_element)
      team.avatar_url = team_element['src'] if team_element.key?('src')
    end
  end
end