scrap_cbf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,93 @@
+ # frozen_string_literal: true
+
+ class ScrapCbf
+   # This class is responsible for:
+   # - Handling user input.
+   # - Fetching the HTML page from the official CBF site.
+   # - Parsing the HTML page with Nokogiri.
+   class Document
+     URL = 'https://www.cbf.com.br/futebol-brasileiro/competicoes'
+
+     CHAMPIONSHIP_YEARS = (2012..Date.today.year.to_i).to_a.freeze
+
+     SERIES = %i[serie_a serie_b].freeze
+
+     SERIES_PATH = {
+       serie_a: 'campeonato-brasileiro-serie-a',
+       serie_b: 'campeonato-brasileiro-serie-b'
+     }.freeze
+
+     SAMPLE_PATH = "#{File.dirname(__FILE__)}/samples/" \
+                   'cbf_serie_a_2020.html'
+
+     class << self
+       def parse_document(year, serie, opts)
+         new(year, serie, opts).parsed_document
+       end
+     end
+
+     attr_reader :year,
+                 :serie,
+                 :load_from_sample,
+                 :sample_path,
+                 :parsed_document
+
+     # @param [Integer] year the championship year
+     # @param [Symbol] serie the championship serie; see SERIES.
+     # @option opts [Boolean] load_from_sample whether to load a local
+     #   HTML sample file instead of fetching the page
+     # @option opts [String] sample_path path to the sample; defaults to SAMPLE_PATH
+     #
+     # @return [Document] new instance
+     def initialize(year, serie, opts)
+       @year = year
+       @serie = serie
+       @load_from_sample = opts.fetch(:load_from_sample) { false }
+       @sample_path = opts[:sample_path]
+
+       @parsed_document =
+         parse_document(year, serie, @load_from_sample, @sample_path)
+     end
+
+     private
+
+     # @param [Integer] year the championship year
+     # @param [Symbol] serie the championship serie; see SERIES.
+     # @param [Boolean] load_from_sample whether to load a local
+     #   HTML sample file instead of fetching the page
+     # @param [String, nil] sample_path path to the sample; defaults to SAMPLE_PATH
+     #
+     # @return [Nokogiri::HTML::Document] the parsed document
+     def parse_document(year, serie, load_from_sample, sample_path)
+       url = if load_from_sample
+               sample_path || SAMPLE_PATH
+             else
+               raise_year_error if year_out_of_range?(year)
+               raise_serie_error if serie_out_of_range?(serie)
+
+               build_url(year, serie)
+             end
+       Nokogiri::HTML(URI.open(url))
+     end
+
+     def build_url(year, serie)
+       "#{URL}/#{SERIES_PATH[serie]}/#{year}"
+     end
+
+     def year_out_of_range?(year)
+       !CHAMPIONSHIP_YEARS.include?(year)
+     end
+
+     def serie_out_of_range?(serie)
+       !SERIES.include?(serie)
+     end
+
+     def raise_year_error
+       raise OutOfRangeArgumentError.new(:year, CHAMPIONSHIP_YEARS)
+     end
+
+     def raise_serie_error
+       raise OutOfRangeArgumentError.new(:serie, SERIES)
+     end
+   end
+ end
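
A minimal usage sketch for Document, assuming the gem's entry point is required as 'scrap_cbf' and that it loads nokogiri, open-uri and date (this file does not require them itself); the variable names are illustrative.

    require 'scrap_cbf' # assumed entry point; loads nokogiri/open-uri/date

    # Fetch and parse the Serie B 2019 fixtures page from the CBF site.
    doc = ScrapCbf::Document.parse_document(2019, :serie_b, {})

    # Skip the network call and parse the bundled 2020 Serie A sample instead.
    sample = ScrapCbf::Document.parse_document(2020, :serie_a, { load_from_sample: true })

    puts doc.css('table').length
    puts sample.css('table').length
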
@@ -0,0 +1,51 @@
+ # frozen_string_literal: true
+
+ class ScrapCbf
+   # Base error for all ScrapCbf errors.
+   class BaseError < ::StandardError; end
+
+   # Raised when a required method is not implemented.
+   class MethodNotImplementedError < BaseError; end
+
+   # Raised when an argument is not included in a predefined range of values.
+   class OutOfRangeArgumentError < BaseError
+     def initialize(argument, range)
+       message = "#{argument} must be in the range of: #{range}"
+       super(message)
+     end
+   end
+
+   # Raised when an argument is required but missing.
+   class MissingArgumentError < BaseError
+     def initialize(argument)
+       message = "missing argument: #{argument}"
+       super(message)
+     end
+   end
+
+   # Raised when the number of scraped entities falls outside a predefined range.
+   class InvalidNumberOfEntitiesError < BaseError
+     def initialize(entity, number)
+       message = "an invalid number of #{entity} entities was found: #{number}"
+       super(message)
+     end
+   end
+
+   # Raised when scraped table data has rows whose length
+   # does not match the header length.
+   class RowSizeError < BaseError
+     def initialize(row_length, header_length)
+       message = "row length: #{row_length} doesn't match " \
+                 "header length: #{header_length}"
+       super(message)
+     end
+   end
+
+   # Raised when a method is not found on a class.
+   class MethodMissingError < BaseError
+     def initialize(klass_name, method)
+       message = "method '#{method}' missing on class #{klass_name}"
+       super(message)
+     end
+   end
+ end
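
As a hedged illustration, callers might handle the range errors raised by Document like this (the year 1999 is deliberately out of range):

    begin
      ScrapCbf::Document.parse_document(1999, :serie_a, {})
    rescue ScrapCbf::OutOfRangeArgumentError => e
      warn e.message # "year must be in the range of: [2012, 2013, ...]"
    end
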
@@ -0,0 +1,16 @@
+ # frozen_string_literal: true
+
+ class ScrapCbf
+   # Formats data into standard output formats.
+   module Formattable
+     # @return [String] the object serialized as JSON
+     def to_json(*_args)
+       to_h.to_json
+     end
+
+     # @return [Hash]
+     def to_h
+       raise MethodNotImplementedError
+     end
+   end
+ end
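
A minimal sketch of a class including Formattable: it only has to implement #to_h, and #to_json additionally relies on the json standard library being loaded. The Score class here is hypothetical, not part of the gem.

    require 'json'

    class ScrapCbf
      class Score
        include Formattable

        def initialize(home, away)
          @home = home
          @away = away
        end

        # Formattable#to_json serializes this hash.
        def to_h
          { home: @home, away: @away }
        end
      end
    end

    ScrapCbf::Score.new(2, 1).to_json # => '{"home":2,"away":1}'
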
@@ -0,0 +1,37 @@
+ # frozen_string_literal: true
+
+ class ScrapCbf
+   # This module uses recursion and regexes to find specific elements in the DOM.
+   # Its best use case is searching a component view whose short text is
+   # wrapped only in an HTML tag without a specific class.
+   module DepthSearch
+     # Searches the element's children recursively and returns the first match.
+     #
+     # The assertion must be passed as a proc that returns nil for
+     # false assertions and the searched element for true assertions.
+     #
+     # @param element [Nokogiri::XML::Element] element to be searched.
+     # @param proc [Proc] proc with the assertion logic.
+     # @return [Object, nil] the matched text or DOM element, or nil.
+     def depth_search(element, proc)
+       res = nil
+       counter = 0
+       number_of_children = element.children.length
+
+       while counter < number_of_children
+
+         child = element.children[counter]
+         res = proc.call(child)
+
+         return res if res # recursion base case #1 - return when found
+
+         res = depth_search(child, proc)
+
+         return res if res # recursion base case #2 - return from recursion
+
+         counter += 1
+       end
+       res
+     end
+   end
+ end
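
A sketch of driving depth_search with a proc, using a small hand-built fragment instead of a real CBF page:

    require 'nokogiri'

    include ScrapCbf::DepthSearch

    html = Nokogiri::HTML('<div><span><i>Jogo: 42</i></span></div>')

    # The proc returns the text when it matches and nil otherwise.
    find_game = proc do |element|
      next nil unless element.text?

      text = element.text.strip
      text if text.match?(/Jogo: \d{1,3}$/i)
    end

    depth_search(html.at('div'), find_game) # => "Jogo: 42"
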
@@ -0,0 +1,17 @@
+ # frozen_string_literal: true
+
+ class ScrapCbf
+   # This helper module gives a more convenient interface to the element
+   # objects generated by Nokogiri. Think of it as a small extension of the
+   # Nokogiri element object.
+   # This module may disappear later.
+   module ElementNokogiri
+     def element_hidden?(element)
+       element['style']&.eql?('display: none')
+     end
+
+     def remove_whitespace(element)
+       element.text.gsub(/\s+/, '')
+     end
+   end
+ end
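
A small sketch of the two helpers on a hand-built fragment (the markup is illustrative):

    require 'nokogiri'

    include ScrapCbf::ElementNokogiri

    cell = Nokogiri::HTML('<td style="display: none"> 3  pts </td>').at('td')

    element_hidden?(cell)   # => true
    remove_whitespace(cell) # => "3pts"
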
@@ -0,0 +1,33 @@
+ # frozen_string_literal: true
+
+ class ScrapCbf
+   # This module has helper methods for finding specific component views
+   # (e.g. tables).
+   module Findable
+     # Finds in the document the first table with a single-level header whose
+     # entries match the compare argument with at least the given accuracy.
+     #
+     # This method uses Array#find, so it returns the element or nil.
+     #
+     # @return [Nokogiri::XML::Element, nil]
+     def find_table_by_header(
+       elems,
+       compare,
+       regex = '[[:alpha:]]+',
+       accuracy = 1.0
+     )
+       elems.find do |table|
+         # check only single-level headers
+         thead = table.css('thead').first
+         next false if !thead || thead.css('tr').length > 1
+
+         header = thead.text.scan(Regexp.new(regex))
+
+         next false if header.empty?
+
+         matches = (compare & header).length
+         matches.fdiv(header.length) >= accuracy # fraction of header entries matched
+       end
+     end
+   end
+ end
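
A sketch of find_table_by_header over a parsed document; the column names below are an illustrative guess at the ranking table's header, and the 0.5 accuracy only requires half of the header entries to match:

    require 'nokogiri'

    include ScrapCbf::Findable

    doc = ScrapCbf::Document.parse_document(2020, :serie_a, { load_from_sample: true })

    # Illustrative header columns to look for.
    columns = %w[Pts J V E D GP GC SG]

    table = find_table_by_header(doc.css('table'), columns, '[[:alpha:]]+', 0.5)
    puts table ? 'ranking table found' : 'no table matched'
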
@@ -0,0 +1,136 @@
+ # frozen_string_literal: true
+
+ require_relative 'lib/depth_search'
+
+ class ScrapCbf
+   module MatchesHelper
+     include DepthSearch
+
+     # e.g. '0 x 1'
+     def score_by_team_helper(team, score)
+       case team
+       when :team
+         score.split(' ')[0].to_i if score
+       when :opponent
+         score.split(' ')[2].to_i if score
+       end
+     end
+
+     # e.g. '03/02/2021 16:00'
+     def date_with_start_at_helper(date, start_at)
+       return unless date && start_at
+
+       "#{date} #{start_at}"
+     end
+
+     # team names are extracted from <img> title attributes
+     def scrap_team_names_helper(team_element)
+       if team_element.key?('title') &&
+          team_element['title'].match?(/^[a-záàâãéèêíïóôõöúç\s]+ - [a-z]{2}$/i)
+
+         # Extract the team's name (e.g. Santos - SP => Santos)
+         team_element['title'][/^[a-záàâãéèêíïóôõöúç\s]{3,50}/i].strip
+       end
+     end
+
+     # Because of a problem passing the regex, these 5 find_* methods
+     # couldn't be merged into 1.
+     # Passes an assertion Proc to depth_search.
+     def find_info_helper(match)
+       find = proc do |element|
+         if element.text?
+           formatted_text = element.text.strip
+           unless formatted_text.empty?
+             res = formatted_text.match?(
+               /Jogo: \d{1,3}$/i
+             )
+             next formatted_text if res
+           end
+         end
+         nil
+       end
+
+       depth_search(match, find)
+     end
+
+     # Because of a problem passing the regex, these 5 find_* methods
+     # couldn't be merged into 1.
+     # Passes an assertion Proc to depth_search.
+     def find_updates_helper(match)
+       find = proc do |element|
+         if element.text?
+           formatted_text = element.text.strip
+           unless formatted_text.empty?
+             res = formatted_text.match?(
+               /\d{1} (ALTERAÇÃO|ALTERAÇÕES)$/i
+             )
+             next formatted_text if res
+           end
+         end
+         nil
+       end
+
+       depth_search(match, find)
+     end
+
+     # Because of a problem passing the regex, these 5 find_* methods
+     # couldn't be merged into 1.
+     # Passes an assertion Proc to depth_search.
+     def find_start_at_helper(match)
+       find = proc do |element|
+         if element.text?
+           formatted_text = element.text.strip
+           unless formatted_text.empty?
+             res = formatted_text.match?(
+               /^\d{2}:\d{2}$/i
+             )
+             next formatted_text if res
+           end
+         end
+         nil
+       end
+
+       depth_search(match, find)
+     end
+
+     # Because of a problem passing the regex, these 5 find_* methods
+     # couldn't be merged into 1.
+     # Passes an assertion Proc to depth_search.
+     def find_score_helper(match)
+       find = proc do |element|
+         if element.text?
+           formatted_text = element.text.strip
+           unless formatted_text.empty?
+             res = formatted_text.match?(
+               /^\d{1} x \d{1}$/i
+             )
+             next formatted_text if res
+           end
+         end
+         nil
+       end
+
+       depth_search(match, find)
+     end
+
+     # Because of a problem passing the regex, these 5 find_* methods
+     # couldn't be merged into 1.
+     # Passes an assertion Proc to depth_search.
+     def find_place_helper(match)
+       find = proc do |element|
+         if element.text?
+           formatted_text = element.text.strip
+           unless formatted_text.empty?
+             res = formatted_text.match?(
+               /^[a-záàâãéèêíïóôõöúçñ\-\s]+ - [a-záàâãéèêíïóôõöúçñ\s\-]+ - [A-Z]{2}$/i
+             )
+             next formatted_text if res
+           end
+         end
+         nil
+       end
+
+       depth_search(match, find)
+     end
+   end
+ end
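
A short sketch of the simpler helpers above, assuming the gem is already loaded; the score and date strings mimic what the scraper extracts:

    include ScrapCbf::MatchesHelper

    score = '2 x 1'
    score_by_team_helper(:team, score)     # => 2
    score_by_team_helper(:opponent, score) # => 1

    date_with_start_at_helper('03/02/2021', '16:00') # => "03/02/2021 16:00"
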
@@ -0,0 +1,19 @@
+ # frozen_string_literal: true
+
+ require_relative 'lib/element_nokogiri'
+ require_relative 'lib/findable'
+
+ class ScrapCbf
+   module RankingsHelper
+     include ElementNokogiri
+     include Findable
+
+     def title_or_nil_helper(element)
+       child_elem = element.children.find do |elem|
+         elem.element? && elem.key?('title')
+       end
+
+       child_elem['title'] if child_elem
+     end
+   end
+ end
@@ -0,0 +1,11 @@
+ # frozen_string_literal: true
+
+ require_relative 'lib/element_nokogiri'
+ require_relative 'lib/findable'
+
+ class ScrapCbf
+   module TeamsHelper
+     include ElementNokogiri
+     include Findable
+   end
+ end
@@ -0,0 +1,26 @@
+ # frozen_string_literal: true
+
+ class ScrapCbf
+   class Championship
+     include Formattable
+     include Printable
+
+     ATTR_ACCESSORS = %i[
+       year
+       serie
+     ].freeze
+
+     attr_accessor(*ATTR_ACCESSORS)
+
+     def initialize(year, serie)
+       @year = year
+       @serie = serie
+     end
+
+     def to_h
+       ATTR_ACCESSORS.each_with_object({}) do |attribute, hash|
+         hash[attribute] = send attribute
+       end.with_indifferent_access
+     end
+   end
+ end
+ end