scrap_cbf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 468506451d0e8869e39a079ddaacf33a8aacc1e2a79052cdb251611217874c0d
4
+ data.tar.gz: 05a8c5cc7e71b8323ee4e3bd73ae723519a24ffe357e392910c044186496b586
5
+ SHA512:
6
+ metadata.gz: ff2bad7d68f58a68d8499c34f175bac2231616f41c99190fa91f80e0a8ed1ecf59fe7717a81bc3a86cefe73fdefaa52028248a6f5f7456a4f40ad85ef1e468b4
7
+ data.tar.gz: 01bc568cdca4a5ccd530bdafd42706b45724f5cd1f6bb01db23182ea8f9ea74eba4e031f7c0603fcefadfba900b6643616486e8499c354c13ba578760096da83
data/lib/scrap_cbf.rb ADDED
@@ -0,0 +1,92 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'nokogiri'
4
+ require 'open-uri'
5
+ require 'json'
6
+ require 'active_support/core_ext/hash/indifferent_access'
7
+ require 'forwardable'
8
+
9
+ require_relative 'scrap_cbf/errors'
10
+ require_relative 'scrap_cbf/formattable'
11
+ require_relative 'scrap_cbf/printable'
12
+ require_relative 'scrap_cbf/document'
13
+ require_relative 'scrap_cbf/models/table/header_column'
14
+ require_relative 'scrap_cbf/models/table/row'
15
+ require_relative 'scrap_cbf/models/table/cell'
16
+ require_relative 'scrap_cbf/models/championship'
17
+ require_relative 'scrap_cbf/models/team'
18
+ require_relative 'scrap_cbf/models/round'
19
+ require_relative 'scrap_cbf/models/match'
20
+ require_relative 'scrap_cbf/models/ranking'
21
+ require_relative 'scrap_cbf/helpers/matches_helper'
22
+ require_relative 'scrap_cbf/helpers/rankings_helper'
23
+ require_relative 'scrap_cbf/helpers/teams_helper'
24
+ require_relative 'scrap_cbf/builders/matches_builder'
25
+ require_relative 'scrap_cbf/builders/matches_per_round_builder'
26
+ require_relative 'scrap_cbf/builders/rankings_builder'
27
+ require_relative 'scrap_cbf/builders/rounds_builder'
28
+ require_relative 'scrap_cbf/builders/teams_builder'
29
+
30
# ScrapCbf is a gem created for scraping data from the CBF official page.
# Some of the data found on the CBF page are:
# teams, matches, rounds and ranking table from all championships found
# on the official page.
class ScrapCbf
  include Formattable
  include Printable

  # @!attribute [r] document
  #   @return [ScrapCbf::Document] ScrapCbf::Document instance.
  # @!attribute [r] championship
  #   @return [ScrapCbf::Championship] ScrapCbf::Championship instance.
  attr_reader :document, :championship

  # Builds the document and the championship for the requested year/serie.
  # The whole opts hash is also handed to Document.new.
  #
  # @param [Hash] opts
  # @option opts [Integer] :year The Championship year (defaults to the
  #   current year).
  # @option opts [Symbol] :serie The Championship serie (defaults to
  #   :serie_a).
  # @option opts [Symbol] :load_from_sample Load championship from sample.
  # @option opts [Symbol] :sample_path to the sample otherwise default
  #
  # @return [ScrapCbf] new instance
  def initialize(opts = {})
    current_year = Date.today.year
    year = opts.fetch(:year, current_year)
    serie = opts.fetch(:serie, :serie_a)

    @document = Document.new(year, serie, opts)
    @parsed_document = document.parsed_document
    @championship = Championship.new(year, serie)
  end

  # Returns all entities scraped in hash format.
  #
  # @return [Hash] indifferent-access hash keyed by entity name
  #   (championship, matches, rankings, rounds, teams).
  def to_h
    entities = %i[championship matches rankings rounds teams]
    scraped = entities.each_with_object({}) do |entity, hash|
      hash[entity] = public_send(entity).to_h
    end

    scraped.with_indifferent_access
  end

  # Memoized; derived from the rounds builder.
  #
  # @return [MatchesBuilder] instance.
  def matches
    return @matches if @matches

    @matches = rounds.matches_builder
  end

  # Memoized.
  #
  # @return [RankingsBuilder] instance.
  def rankings
    return @rankings if @rankings

    @rankings = RankingsBuilder.new(@parsed_document, @championship)
  end

  # Memoized.
  #
  # @return [RoundsBuilder] instance.
  def rounds
    return @rounds if @rounds

    @rounds = RoundsBuilder.new(@parsed_document, @championship)
  end

  # Memoized.
  #
  # @return [TeamsBuilder] instance.
  def teams
    return @teams if @teams

    @teams = TeamsBuilder.new(@parsed_document)
  end
end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
class ScrapCbf
  # Collection wrapper around a list of matches that was built elsewhere.
  class MatchesBuilder
    extend Forwardable
    include Formattable
    include Printable

    attr_accessor :matches

    # Iteration and mapping are forwarded to the wrapped collection.
    def_delegators :@matches, :each, :map

    # @param matches [Array] the matches to wrap.
    def initialize(matches)
      @matches = matches
    end

    # @return [Array] the result of calling to_h on every wrapped match.
    def to_h
      @matches.map { |match| match.to_h }
    end
  end
end
@@ -0,0 +1,113 @@
1
+ # frozen_string_literal: true
2
+
3
+ class ScrapCbf
4
+ class MatchesPerRoundBuilder
5
+ extend Forwardable
6
+ include MatchesHelper
7
+ include Formattable
8
+ include Printable
9
+
10
+ delegate %i[each map] => :@matches
11
+
12
+ attr_reader :matches
13
+ alias all matches
14
+
15
# Scrapes the matches of one round as soon as the builder is created.
#
# @param matches_elements [#children] node whose element children are the
#   matches (presumably a Nokogiri <ul> element; verify against caller).
# @param round_number [Integer] round number stored on every match.
# @param championship [#year, #serie] championship copied onto every match.
def initialize(matches_elements, round_number, championship)
  @matches = []
  @championship = championship

  scrap_matches(matches_elements, round_number)
end
21
+
22
# @return [Array] the result of calling to_h on every scraped match.
def to_h
  @matches.map { |match| match.to_h }
end
25
+
26
+ private
27
+
28
# Scrapes one match per element child of the given node and appends it to
# @matches; non-element children (e.g. text nodes) are ignored.
def scrap_matches(matches_elements, round_number)
  matches_elements.children.each do |child|
    @matches << scrap_match(child, round_number) if child.element?
  end
end
35
+
36
# Builds a single Match from its element. The scraping steps run in a fixed
# order: scrap_start_at runs after scrap_info because it reads the date and
# start time that scrap_info may have set.
#
# @return [Match] the populated match.
def scrap_match(match_element, round_number)
  Match.new.tap do |match|
    match.championship = @championship.year
    match.serie = @championship.serie
    match.round = round_number

    scrap_info(match, match_element)     # e.g "Qua, 03/02/2021 16:00 - Jogo: 336"
    scrap_teams(match, match_element)    # e.g <img title="team-name">
    scrap_update(match, match_element)   # e.g "1 alteração" (can be undefined)
    # e.g "16:00" (can be found in two places, we take only the first)
    scrap_start_at(match, match_element)
    scrap_score(match, match_element)    # e.g "1 x 1" (can be undefined)
    scrap_place(match, match_element)    # e.g "Vila Belmiro - Santos - SP" (can be undefined)
  end
end
58
+
59
# Extracts the match id, date and start time from the info text,
# e.g "Qua, 03/02/2021 16:00 - Jogo: 336".
#
# Does nothing when the helper finds no info text. Each attribute is set to
# nil when its pattern is absent from the text.
def scrap_info(match, match_element)
  info = find_info_helper(match_element)
  return unless info

  # e.g "Jogo: 336" -> "336". The capture group keeps only the digits, so the
  # label is dropped whatever its letter case, and a text without an id
  # yields nil instead of raising NoMethodError on nil.
  match.id_match = info[/Jogo: (\d{1,3})$/i, 1]
  # e.g "03/02/2021" (can be undefined)
  match.date = info[%r{\d{2}/\d{2}/\d{2,4}}i]
  # e.g "16:00" (can be undefined)
  match.start_at = info[/\d{2}:\d{2}/i]
end
70
+
71
# Stores on the match whatever place the helper finds for the element.
def scrap_place(match, match_element)
  place = find_place_helper(match_element)
  match.place = place
end
74
+
75
# Finds the score for the element and stores each side's part on the match.
def scrap_score(match, match_element)
  raw_score = find_score_helper(match_element)

  match.team_score = score_by_team_helper(:team, raw_score)
  match.opponent_score = score_by_team_helper(:opponent, raw_score)
end
81
+
82
# Stores on the match whatever updates the helper finds for the element.
def scrap_update(match, match_element)
  updates = find_updates_helper(match_element)
  match.updates = updates
end
85
+
86
# Reads the two team names of a match from its <img> elements and stores
# them as team (first image) and opponent (second image).
#
# @raise [InvalidNumberOfEntitiesError] when the element does not hold
#   exactly two images, or when fewer than two names could be scraped.
def scrap_teams(match, match_element)
  teams_elements = match_element.css('img')

  unless teams_elements.length == 2
    raise InvalidNumberOfEntitiesError.new(:team, teams_elements.length)
  end

  # Skipped nodes and nil results from the helper are dropped here;
  # otherwise they would stay in the array as nil, the count check below
  # could never fail and a nil team name would reach the match.
  teams_name = teams_elements.map do |team_element|
    scrap_team_names_helper(team_element) if team_element.element?
  end.compact

  unless teams_name.length == 2
    raise InvalidNumberOfEntitiesError.new(:team, teams_name.length)
  end

  match.team = teams_name[0]
  match.opponent = teams_name[1]
end
106
+
107
# Fills in the start time when it is not set yet, then replaces the match
# date with the combined date/time whenever the helper can build one.
def scrap_start_at(match, match_element)
  match.start_at ||= find_start_at_helper(match_element)

  datetime = date_with_start_at_helper(match.date, match.start_at)
  return unless datetime

  match.date = datetime
end
112
+ end
113
+ end
@@ -0,0 +1,118 @@
1
+ # frozen_string_literal: true
2
+
3
+ class ScrapCbf
4
+ class RankingsBuilder
5
+ extend Forwardable
6
+ include RankingsHelper
7
+ include Formattable
8
+ include Printable
9
+
10
+ delegate [:each] => :@rankings
11
+
12
# Locates the ranking table in the document and scrapes it. When no table
# with the ranking header is found the builder simply stays empty.
#
# @param document [#css] parsed page (presumably a Nokogiri document;
#   verify against caller).
# @param championship [#year, #serie] championship copied onto each ranking.
def initialize(document, championship)
  @championship = championship
  @rankings = []
  @header = []
  @rows = []

  table = find_table_by_header(document.css('table'), Ranking::TABLE_HEADER)
  scrap_rankings(table) if table
end
25
+
26
# @return [Array] the result of calling to_h on every scraped ranking.
def to_h
  @rankings.map { |ranking| ranking.to_h }
end
29
+
30
+ private
31
+
32
# Runs the three scraping stages in order: header, body, then the
# conversion of the collected rows into rankings.
def scrap_rankings(table)
  %i[scrap_header scrap_body].each { |stage| send(stage, table) }

  create_rankings_from_table
end
37
+
38
# Collects one HeaderColumn (text plus optional title) per <th> of the
# table head. Cells for which no text is obtained are skipped.
def scrap_header(table)
  table.css('thead > tr > th').each do |th|
    next unless th.element?

    text = remove_whitespace(th)
    next unless text

    @header << HeaderColumn.new(text, title_or_nil_helper(th))
  end
end
48
+
49
# Turns every visible <tr> of the table body into a Row of Cells.
#
# A cell that carries both a text and a team name (the first one, e.g
# posicao: 7º and team: Fluminense) produces two cells; a cell with only a
# team name produces the team; any other cell produces its text.
#
# @raise [RowSizeError] when a row does not hold header length + 1 cells.
def scrap_body(table)
  table.css('tbody > tr').each do |tr_element|
    next if tr_element.element? && element_hidden?(tr_element)

    row = Row.new
    tr_element.children.each do |td_element|
      next unless td_element.element?

      raw_text = remove_whitespace(td_element)
      next unless raw_text

      text = scrap_position_if_exist(raw_text)
      team = scrap_team_name_if_exist(td_element)
      team_present = team && !team.empty?

      values =
        if team_present && text && !text.empty?
          [text, team]
        elsif team_present
          [team]
        else
          [text]
        end
      values.each { |value| row.cells << Cell.new(value) }
    end

    # Add 1 to header length because on first cell we scrap 2 values
    actual_length = row.cells.length
    expected_length = @header.length + 1
    unless actual_length == expected_length
      raise RowSizeError.new(actual_length, expected_length)
    end

    @rows << row
  end
end
84
+
85
# Returns the leading position without its ordinal sign when the text
# starts with one (e.g "7º ..." -> "7"); otherwise returns the text as is.
def scrap_position_if_exist(text)
  position = text&.slice(/^\d{1,2}º/i)
  return text unless position

  position.strip.delete('º')
end
93
+
94
# Returns the team name held in the element's title when the title has the
# "<name> - <two-letter state>" shape (e.g "Fluminense - RJ"), otherwise nil.
#
# A single pattern both validates the title and captures the name, so every
# title that is accepted yields its complete name: hyphenated names are no
# longer cut at the hyphen, and very short or hyphen-leading names no longer
# raise NoMethodError on nil.
def scrap_team_name_if_exist(element)
  title = title_or_nil_helper(element)

  name = title && title[/^([a-záàâãéèêíïóôõöúç\s\-]+) - [a-z]{2}$/i, 1]
  name&.strip
end
101
+
102
# Converts every collected row into a Ranking. The cell at position N is
# assigned through the writer named after Ranking::ATTRS_RANK[N].
def create_rankings_from_table
  @rows.each do |row|
    ranking = Ranking.new
    ranking.championship = @championship.year
    ranking.serie = @championship.serie

    row.cells.each.with_index do |cell, position|
      writer = "#{Ranking::ATTRS_RANK[position]}="
      ranking.send(writer, cell.value)
    end

    @rankings << ranking
  end
end
117
+ end
118
+ end
@@ -0,0 +1,75 @@
1
+ # frozen_string_literal: true
2
+
3
class ScrapCbf
  # Builds the rounds of a championship, each round carrying its matches.
  class RoundsBuilder
    extend Forwardable
    include Formattable
    include Printable

    # Iteration is forwarded to the collected rounds.
    def_delegators :@rounds, :each

    # Scrapes every round out of the document on creation.
    #
    # @param document [#css] parsed page (presumably a Nokogiri document;
    #   verify against caller).
    # @param championship [#year, #serie] championship copied onto each round.
    def initialize(document, championship)
      @rounds = []
      @championship = championship

      scrap_rounds(document)
    end

    # Gathers the matches of all rounds into a single builder.
    #
    # @return [MatchesBuilder] builder wrapping every match, round by round.
    def matches_builder
      all_matches = @rounds.flat_map { |round| round.matches.all }

      MatchesBuilder.new(all_matches)
    end

    # @return [Array] the result of calling to_h on every round.
    def to_h
      @rounds.map { |round| round.to_h }
    end

    private

    # Looks up the slide of each index from 0 to 37 and scrapes one round
    # per <div> child found inside it.
    def scrap_rounds(rounds_elements)
      38.times do |round_number|
        selector = "div[data-slide-index=#{round_number}]"

        rounds_elements.css(selector).children.each do |element|
          next unless element.element? && element.name == 'div'

          @rounds << scrap_round(element, round_number)
        end
      end
    end

    # Builds one Round and fills in its matches.
    def scrap_round(round_element, round_number)
      Round.new.tap do |round|
        round.championship = @championship.year
        round.serie = @championship.serie
        # Because index starts on zero, we add 1 for matching with Rounds ID
        round.number = round_number + 1

        scrap_matches(round, round_element)
      end
    end

    # Assigns to the round a matches builder made from each <ul> child of
    # the round element (matches are found on <ul>); when several exist the
    # last one is the one that stays assigned.
    #
    # @return [Round] the round that was passed in.
    def scrap_matches(round, round_element)
      lists = round_element.children.select do |element|
        element.element? && element.name == 'ul'
      end

      lists.each do |list|
        round.matches = MatchesPerRoundBuilder.new(list, round.number, @championship)
      end

      round
    end
  end
end
@@ -0,0 +1,78 @@
1
+ # frozen_string_literal: true
2
+
3
+ class ScrapCbf
4
+ class TeamsBuilder
5
+ extend Forwardable
6
+ include TeamsHelper
7
+ include Formattable
8
+ include Printable
9
+
10
+ delegate [:each] => :@teams
11
+
12
# Locates the table with the ranking header in the document and scrapes
# the teams listed in it.
#
# @param document [#css] parsed page (presumably a Nokogiri document;
#   verify against caller).
def initialize(document)
  @teams = []

  ranking_table = find_table_by_header(document.css('table'), Ranking::TABLE_HEADER)
  scrap_teams(ranking_table)
end
23
+
24
# @return [Array] the result of calling to_h on every scraped team.
def to_h
  @teams.map { |team| team.to_h }
end
27
+
28
+ private
29
+
30
# Scrapes one team per visible row of the table body and appends it to
# @teams. Does nothing when no table was found.
#
# @param table [#css, nil] the table holding the teams, or nil.
# @raise [InvalidNumberOfEntitiesError] when a row holds no image or more
#   than two images.
def scrap_teams(table)
  # The table lookup can come back empty-handed; without this guard the
  # call below would raise NoMethodError on nil.
  return unless table

  table.css('tbody > tr').each do |tr_element|
    # Remove the rows that are invisible by default
    next if tr_element.element? && element_hidden?(tr_element)

    teams_elements = tr_element.css('img')

    # two teams are found in a row: team and next opponent
    # the last one may be not present
    unless teams_elements.length.between?(1, 2)
      raise InvalidNumberOfEntitiesError.new(:team, teams_elements.length)
    end

    # only the first team is scraped
    @teams << scrap_team(teams_elements.first)
  end
end
49
+
50
# Builds a Team from an element. Name, state and avatar are filled in only
# when the element has a title shaped like "<name> - <two-letter state>";
# otherwise the team is returned untouched.
#
# @return [Team] the (possibly empty) team.
def scrap_team(team_element)
  team = Team.new

  return team unless team_element&.key?('title')
  return team unless team_element['title'].match?(/^[a-záàâãéèêíïóôõöúç\s]+ - [a-z]{2}$/i)

  scrap_name(team, team_element)
  scrap_state(team, team_element)
  scrap_avatar_url(team, team_element)

  team
end
63
+
64
# Stores the leading name part of the element's title on the team,
# e.g "Santos" out of "Santos - SP".
def scrap_name(team, team_element)
  title = team_element['title']
  team.name = title[/^[a-záàâãéèêíïóôõöúç\s]{3,50}/i].strip
end
68
+
69
# Stores the two letters that end the element's title on the team,
# e.g "SP" out of "Santos - SP".
def scrap_state(team, team_element)
  title = team_element['title']
  team.state = title[/[a-z]{2}$/i]
end
73
+
74
# Copies the element's src onto the team; elements without a src leave the
# team untouched.
def scrap_avatar_url(team, team_element)
  return unless team_element.key?('src')

  team.avatar_url = team_element['src']
end
77
+ end
78
+ end