scrap_cbf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/scrap_cbf.rb +92 -0
- data/lib/scrap_cbf/builders/matches_builder.rb +21 -0
- data/lib/scrap_cbf/builders/matches_per_round_builder.rb +113 -0
- data/lib/scrap_cbf/builders/rankings_builder.rb +118 -0
- data/lib/scrap_cbf/builders/rounds_builder.rb +75 -0
- data/lib/scrap_cbf/builders/teams_builder.rb +78 -0
- data/lib/scrap_cbf/document.rb +93 -0
- data/lib/scrap_cbf/errors.rb +51 -0
- data/lib/scrap_cbf/formattable.rb +16 -0
- data/lib/scrap_cbf/helpers/lib/depth_search.rb +37 -0
- data/lib/scrap_cbf/helpers/lib/element_nokogiri.rb +17 -0
- data/lib/scrap_cbf/helpers/lib/findable.rb +33 -0
- data/lib/scrap_cbf/helpers/matches_helper.rb +136 -0
- data/lib/scrap_cbf/helpers/rankings_helper.rb +19 -0
- data/lib/scrap_cbf/helpers/teams_helper.rb +11 -0
- data/lib/scrap_cbf/models/championship.rb +26 -0
- data/lib/scrap_cbf/models/match.rb +38 -0
- data/lib/scrap_cbf/models/ranking.rb +45 -0
- data/lib/scrap_cbf/models/round.rb +33 -0
- data/lib/scrap_cbf/models/table/cell.rb +10 -0
- data/lib/scrap_cbf/models/table/header_column.rb +11 -0
- data/lib/scrap_cbf/models/table/row.rb +10 -0
- data/lib/scrap_cbf/models/team.rb +19 -0
- data/lib/scrap_cbf/printable.rb +19 -0
- data/lib/scrap_cbf/samples/cbf_serie_a_2020.html +17330 -0
- data/lib/scrap_cbf/version.rb +5 -0
- metadata +193 -0
@@ -0,0 +1,93 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class ScrapCbf
  # This class is responsible for:
  # - Handling the user's input (year, serie and options).
  # - Fetching the HTML page from the CBF official page, or reading a
  #   sample HTML file instead.
  # - Parsing the HTML page with Nokogiri.
  class Document
    # Base address of the competition pages.
    URL = 'https://www.cbf.com.br/futebol-brasileiro/competicoes'

    # Accepted championship years. Evaluated once, when this file is loaded.
    CHAMPIONSHIP_YEARS = (2012..Date.today.year).to_a.freeze

    # Accepted championship series.
    SERIES = %i[serie_a serie_b].freeze

    # URL path segment used for each serie.
    SERIES_PATH = {
      serie_a: 'campeonato-brasileiro-serie-a',
      serie_b: 'campeonato-brasileiro-serie-b'
    }.freeze

    # Sample page used when loading from a sample without an explicit path.
    SAMPLE_PATH = "#{File.dirname(__FILE__)}/samples/" \
      'cbf_serie_a_2020.html'

    class << self
      # Builds a Document and returns only its parsed page.
      #
      # @param [Integer] year the Championship year
      # @param [Symbol] serie the Championship serie. see SERIES.
      # @param [Hash] opts see {#initialize}
      #
      # @return [Nokogiri::HTML::Document] the parsed page
      def parse_document(year, serie, opts = {})
        new(year, serie, opts).parsed_document
      end
    end

    attr_reader :year,
                :serie,
                :load_from_sample,
                :sample_path,
                :parsed_document

    # @param [Integer] year the Championship year
    # @param [Symbol] serie the Championship serie. see SERIES.
    # @option opts [Boolean] load_from_sample whether to load a sample HTML
    #  file instead of the live page (defaults to false)
    # @option opts [String] sample_path path to the sample; SAMPLE_PATH is
    #  used when it is missing
    #
    # @raise [OutOfRangeArgumentError] when not loading from a sample and
    #  year or serie is not an accepted value
    # @return [Document] new instance
    def initialize(year, serie, opts = {})
      @year = year
      @serie = serie
      @load_from_sample = opts.fetch(:load_from_sample, false)
      @sample_path = opts[:sample_path]

      @parsed_document =
        parse_document(year, serie, @load_from_sample, @sample_path)
    end

    private

    # Opens the page source and parses it.
    #
    # Year and serie are validated only when the live page is requested;
    # they are ignored when loading from a sample.
    #
    # @param [Integer] year the Championship year
    # @param [Symbol] serie the Championship serie. see SERIES.
    # @param [Boolean] load_from_sample whether to read a sample file
    # @param [String, nil] sample_path path to the sample, or nil for
    #  SAMPLE_PATH
    #
    # @raise [OutOfRangeArgumentError] see {#initialize}
    # @return [Nokogiri::HTML::Document] the parsed page
    def parse_document(year, serie, load_from_sample, sample_path)
      source = if load_from_sample
                 sample_path || SAMPLE_PATH
               else
                 raise_year_error if year_out_of_range?(year)
                 raise_serie_error if serie_out_of_range?(serie)

                 build_url(year, serie)
               end

      # Block form so the underlying stream is closed as soon as parsing ends.
      # NOTE(review): for a non-URL string open-uri appears to fall back to
      # Kernel#open, so sample_path must come from a trusted source - confirm.
      URI.open(source) { |io| Nokogiri::HTML(io) }
    end

    def build_url(year, serie)
      "#{URL}/#{SERIES_PATH[serie]}/#{year}"
    end

    def year_out_of_range?(year)
      !CHAMPIONSHIP_YEARS.include?(year)
    end

    def serie_out_of_range?(serie)
      !SERIES.include?(serie)
    end

    def raise_year_error
      raise OutOfRangeArgumentError.new(:year, CHAMPIONSHIP_YEARS)
    end

    def raise_serie_error
      raise OutOfRangeArgumentError.new(:serie, SERIES)
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class ScrapCbf
  # Common ancestor of every error raised by ScrapCbf.
  class BaseError < ::StandardError; end

  # Raised when a required method is not implemented.
  class MethodNotImplementedError < BaseError; end

  # Raised when an argument is not included in a predefined range of values.
  class OutOfRangeArgumentError < BaseError
    # @param argument [#to_s] name of the offending argument
    # @param range [#to_s] the accepted values
    def initialize(argument, range)
      super("#{argument} must be in the range of : #{range}")
    end
  end

  # Raised when an argument is required but is missing.
  class MissingArgumentError < BaseError
    # @param argument [#to_s] name of the missing argument
    def initialize(argument)
      super("missing argument: #{argument}")
    end
  end

  # Raised when the number of scraped entities is not an accepted one.
  class InvalidNumberOfEntitiesError < BaseError
    # @param entity [#to_s] name of the entity
    # @param number [#to_s] how many entities were found
    def initialize(entity, number)
      super("an invalid number of #{entity} entities was found: #{number}")
    end
  end

  # Raised when, for data scraped from a table, the size of a row differs
  # from the size of the table's header.
  class RowSizeError < BaseError
    # @param row_length [#to_s] size of the row
    # @param header_length [#to_s] size of the header
    def initialize(row_length, header_length)
      super("row length: #{row_length} doesn't match with header length: #{header_length}")
    end
  end

  # Raised when a method is not found on a class.
  class MethodMissingError < BaseError
    # @param klass_name [#to_s] name of the class
    # @param method [#to_s] name of the missing method
    def initialize(klass_name, method)
      super("method '#{method}' missing on class #{klass_name}")
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class ScrapCbf
  # Formats data into standard outputs.
  #
  # Including classes must override {#to_h}.
  module Formattable
    # Serializes the object through its Hash representation.
    #
    # Any argument is forwarded to the Hash's own +to_json+, so a generator
    # state or options handed in by the caller is not lost.
    #
    # @param args [Array] arguments handed over to +to_h.to_json+
    # @return [String] the JSON text
    def to_json(*args)
      to_h.to_json(*args)
    end

    # Placeholder that including classes are expected to override.
    #
    # @raise [MethodNotImplementedError] always, unless overridden
    # @return [Hash]
    def to_h
      raise MethodNotImplementedError
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class ScrapCbf
  # This module uses recursion to find specific nodes on the DOM.
  # A good use case is searching a component view that has a small text
  # wrapped only in an HTML tag without a specific class.
  module DepthSearch
    # Searches the element's descendants depth-first (each child is tested
    # before its own children) and returns the first truthy result produced
    # by the assertion. The element itself is never tested.
    #
    # The assertion must be passed as a callable: it must return nil (or
    # false) when the node does not match, and the value to be returned
    # when it does.
    #
    # @param element [#children] node whose descendants are searched.
    # @param proc [#call] assertion invoked with each descendant.
    # @return [Object, nil] the first truthy assertion result, or nil when
    #  no descendant matches.
    def depth_search(element, proc)
      element.children.each do |child|
        found = proc.call(child) || depth_search(child, proc)

        return found if found
      end

      nil
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class ScrapCbf
  # This helper is an outlier. Its purpose is to give a better interface
  # for the objects generated by Nokogiri.
  # We can think of it as an extension of the Nokogiri element object.
  # This module may disappear later.
  module ElementNokogiri
    # Matches a +display: none+ declaration anywhere in an inline style,
    # regardless of spacing, letter case or a trailing semicolon.
    HIDDEN_STYLE = /(?:\A|;)\s*display\s*:\s*none\b/i.freeze

    # Tells whether the element's inline +style+ attribute hides it.
    #
    # @param element [#[]] object answering +['style']+ with a String or nil
    # @return [Boolean] true when the style declares +display: none+,
    #  false otherwise (including when there is no style at all)
    def element_hidden?(element)
      element['style'].to_s.match?(HIDDEN_STYLE)
    end

    # Returns the element's text with every whitespace character removed.
    #
    # @param element [#text]
    # @return [String]
    def remove_whitespace(element)
      element.text.gsub(/\s+/, '')
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class ScrapCbf
  # This module has methods that help with the task of finding specific
  # component views (e.g. a table).
  module Findable
    # Finds the first table that has a single-row header whose words match
    # the argument compare with at least the given accuracy.
    #
    # The header words are extracted from the text of the table's first
    # <thead> with the given regex; repeated words count once. The accuracy
    # of a table is the fraction of those words that are also present in
    # compare. Tables without a <thead>, with more than one header row, or
    # whose header yields no word are skipped.
    #
    # This method uses Enumerable#find, so it returns the table or nil.
    #
    # @param elems [Enumerable] candidate tables (each answering +css+)
    # @param compare [Array<String>] the expected header words
    # @param regex [String, Regexp] pattern used to extract header words
    # @param accuracy [Numeric] minimum fraction (0.0 to 1.0) of header
    #  words that must be present in compare
    # @return [Nokogiri::XML::Element, nil]
    def find_table_by_header(
      elems,
      compare,
      regex = '[[:alpha:]]+',
      accuracy = 1.0
    )
      pattern = Regexp.new(regex)

      elems.find do |table|
        # check only single level header
        thead = table.css('thead').first
        # `next` skips just this table; a `return` here would leave the
        # whole method and hide every table that follows.
        next false if !thead || thead.css('tr').length > 1

        header = thead.text.scan(pattern).uniq
        next false if header.empty?

        matches = (compare & header).length
        # Float ratio of matched words over header words; header is never
        # empty at this point, so there is no division by zero.
        matches.fdiv(header.length) >= accuracy
      end
    end
  end
end
|
@@ -0,0 +1,136 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'lib/depth_search'
|
4
|
+
|
5
|
+
class ScrapCbf
  # Helpers used to extract the pieces of information of a match
  # (teams, score, date, place, ...) from its DOM element.
  module MatchesHelper
    include DepthSearch

    # Extracts the goals of one side from a score text (e.g "0 x 1").
    #
    # @param team [Symbol] :team for the first number, :opponent for the
    #  second one
    # @param score [String, nil] the score text
    # @return [Integer, nil] nil when score is missing or team is neither
    #  :team nor :opponent
    def score_by_team_helper(team, score)
      case team
      when :team
        score.split(' ')[0].to_i if score
      when :opponent
        score.split(' ')[2].to_i if score
      end
    end

    # Joins date and start time (e.g "03/02/2021 16:00").
    #
    # @param date [String, nil]
    # @param start_at [String, nil]
    # @return [String, nil] nil when either part is missing
    def date_with_start_at_helper(date, start_at)
      return unless date && start_at

      "#{date} #{start_at}"
    end

    # Teams are extracted from the title of an <img>.
    #
    # @param team_element [#key?, #[]] element holding the title attribute
    # @return [String, nil] the team's name, or nil when the element has no
    #  title in the "Name - UF" format
    def scrap_team_names_helper(team_element)
      if team_element.key?('title') &&
         team_element['title'].match?(/^[a-záàâãéèêíïóôõöúç\s]+ - [a-z]{2}$/i)

        # Extract team's name (e.g Santos - SP => Santos)
        team_element['title'][/^[a-záàâãéèêíïóôõöúç\s]{3,50}/i].strip
      end
    end

    # Finds the match number text (a text ending in e.g "Jogo: 12").
    #
    # @param match [#children] element of the match
    # @return [String, nil]
    def find_info_helper(match)
      find_text_by_pattern(match, /Jogo: \d{1,3}$/i)
    end

    # Finds the updates text (e.g "1 ALTERAÇÃO", "2 ALTERAÇÕES").
    #
    # @param match [#children] element of the match
    # @return [String, nil]
    def find_updates_helper(match)
      find_text_by_pattern(match, /\d{1} (ALTERAÇÃO|ALTERAÇÕES)$/i)
    end

    # Finds the start time text (e.g "16:00").
    #
    # @param match [#children] element of the match
    # @return [String, nil]
    def find_start_at_helper(match)
      find_text_by_pattern(match, /^\d{2}:\d{2}$/i)
    end

    # Finds the score text (e.g "0 x 1").
    #
    # @param match [#children] element of the match
    # @return [String, nil]
    def find_score_helper(match)
      find_text_by_pattern(match, /^\d{1} x \d{1}$/i)
    end

    # Finds the place text, made of three parts separated by " - " and
    # ending with a two-letter code.
    #
    # @param match [#children] element of the match
    # @return [String, nil]
    def find_place_helper(match)
      find_text_by_pattern(
        match,
        /^[a-záàâãéèêíïóôõöúçñ\-\s]+ - [a-záàâãéèêíïóôõöúçñ\s\-]+ - [A-Z]{2}$/i
      )
    end

    private

    # Shared search behind the find_*_helper methods: walks the descendants
    # of match with depth_search and returns the stripped text of the first
    # text node that matches the pattern.
    #
    # @param match [#children] element of the match
    # @param pattern [Regexp] pattern the stripped text must match
    # @return [String, nil] nil when no text node matches
    def find_text_by_pattern(match, pattern)
      matcher = lambda do |element|
        next unless element.text?

        text = element.text.strip
        text if text.match?(pattern)
      end

      depth_search(match, matcher)
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'lib/element_nokogiri'
|
4
|
+
require_relative 'lib/findable'
|
5
|
+
|
6
|
+
class ScrapCbf
  # Helpers used while scraping the rankings table.
  module RankingsHelper
    include ElementNokogiri
    include Findable

    # Looks among the direct children of the element for the first one
    # that is an element node carrying a +title+ attribute.
    #
    # @param element [#children]
    # @return [String, nil] that child's title, or nil when there is none
    def title_or_nil_helper(element)
      titled = element.children.find do |node|
        node.element? && node.key?('title')
      end
      return unless titled

      titled['title']
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class ScrapCbf
  # Holds the year and the serie of a championship.
  class Championship
    include Formattable
    include Printable

    # Attributes exposed through accessors and exported by {#to_h}.
    ATTR_ACCESSORS = %i[
      year
      serie
    ].freeze

    attr_accessor(*ATTR_ACCESSORS)

    # @param year the Championship year
    # @param serie the Championship serie
    def initialize(year, serie)
      @year = year
      @serie = serie
    end

    # Exports every attribute listed in ATTR_ACCESSORS.
    #
    # @return [Hash] attribute name => current value, converted with
    #  +with_indifferent_access+
    def to_h
      pairs = ATTR_ACCESSORS.map { |name| [name, send(name)] }

      pairs.to_h.with_indifferent_access
    end
  end
end
|