sportdb-formats 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,176 +0,0 @@
1
- # encoding: utf-8
2
-
3
-
4
module SportDb

## shared "higher-level" outline reader
##  todo: add CountryOutlineReader - why? why not?

##
## Splits an outline text into league sections:
##   every heading 1 (league + season, optionally followed by a stage)
##   opens a new section; the paragraph lines that follow are collected
##   into the section opened last.
##
## note: all error paths print a message and terminate via exit 1
class LeagueOutlineReader   ## todo/check - rename to LeaguePageReader / LeaguePageOutlineReader - why? why not?

  ## read the (utf-8 encoded) file at path and parse it;
  ##   see #parse for the season filter and the return value
  def self.read( path, season: nil )   ## use - rename to read_file or from_file etc. - why? why not?
    txt = File.open( path, 'r:utf-8' ) {|f| f.read }
    parse( txt, season: season )
  end

  ## convenience shortcut for new( txt ).parse( season: season )
  def self.parse( txt, season: nil )
    new( txt ).parse( season: season )
  end


  def initialize( txt )
    @txt = txt
  end

  ## Parse the text passed to the constructor.
  ##
  ##  season - optional filter; a single season, an Array or a Range
  ##             (see #norm_seasons); sections of other seasons get dropped
  ##
  ##  returns an array of section hashes with the keys
  ##    :league  - result of catalog.leagues.find!
  ##    :season  - result of Season.parse
  ##    :stage   - stage name (string) or nil if the heading has none
  ##    :lines   - array of all paragraph lines of the section
  def parse( season: nil )
    secs = []     # sec(tion)s

    ## pass 1 - collect sections (heading + lines)
    OutlineReader.parse( @txt ).each do |node|
      case node[0]
      when :h1
        ## check for league (and stage) and season
        heading = node[1]
        values  = split_league( heading )

        ## note: values is empty for a blank heading - use to_s
        ##        to end up in the (regular) error branch below
        m = LEAGUE_SEASON_HEADING_RE.match( values[0].to_s )
        if m
          puts "league >#{m[:league]}<, season >#{m[:season]}<"

          secs << { league: m[:league],
                    season: m[:season],
                    stage:  values[1],     ## note: defaults to nil if not present
                    lines:  []
                  }
        else
          puts "** !!! ERROR - cannot match league and season in heading; season missing?"
          pp heading
          exit 1
        end
      when :p    ## paragraph with (text) lines
        lines = node[1]
        ## note: skip lines if no heading seen
        if secs.empty?
          puts "** !!! WARN - skipping lines (no heading):"
          pp lines
        else
          ## todo/check: unroll paragraphs into lines or pass along paragraphs - why? why not?
          secs.last[:lines].concat( lines )
        end
      else
        puts "** !!! ERROR - unknown line type; for now only heading 1 for leagues supported; sorry:"
        pp node
        exit 1
      end
    end

    ## pass 2 - filter seasons if filter present
    if season
      filter = norm_seasons( season )
      secs = secs.select do |sec|
        keep = filter.include?( Season.parse( sec[:season] ).key )
        puts " skipping season >#{sec[:season]}< NOT matched by filter"   unless keep
        keep
      end
    end

    ## pass 3 - check & map; replace inline (string with data struct record)
    secs.each do |sec|
      sec[:season] = Season.parse( sec[:season] )
      sec[:league] = catalog.leagues.find!( sec[:league] )

      check_stage( sec[:stage] )   if sec[:stage]   ## note: only check for now (no remapping etc.)
    end

    secs
  end # method parse


  def catalog() Import.catalog; end    ## shortcut convenience helper

  ## split into league + season
  ##  e.g. Österr. Bundesliga 2015/16    ## or 2015-16
  ##       World Cup 2018
  ##
  ## note: anchored with \A and \z (whole string) -
  ##        gets matched against a single stripped heading value only
  LEAGUE_SEASON_HEADING_RE = %r{\A
           (?<league>.+?)       ## non-greedy
              \s+
           (?<season>\d{4}
              (?:[\/-]\d{1,4})?   ## optional 2nd year in season
           )
          \z}x


  ## normalize the season filter into an array of season keys;
  ##   accepts an Array, a Range or a single entry
  def norm_seasons( season_or_seasons )   ## todo/check: add alias norm_seasons - why? why not?
    seasons = case season_or_seasons
              when Array then season_or_seasons         # is it an array already
              when Range then season_or_seasons.to_a    # e.g. Season(1999)..Season(2001) or such
              else            [season_or_seasons]       ## assume - single entry - wrap in array
              end

    seasons.map { |season| Season( season ).key }
  end


  ## split heading into league / stage / ... e.g.
  ##   => Österr. Bundesliga 2018/19, Regular Season
  ##   => Österr. Bundesliga 2018/19, Championship Round
  ##   etc.
  ##
  ##  returns an array of stripped strings (empty array for an empty heading)
  def split_league( str )   ## todo/check: rename to parse_league(s) - why? why not?
    str.split( /[,<>‹›]/ )                 ## note: allow , > < or › ‹ for now
       .map { |value| value.strip }        ## remove leading and trailing whitespaces
  end


  # note: normalize names e.g. downcase and remove all non a-z chars (e.g. space, dash, etc.)
  KNOWN_STAGES = [
    'Regular Season',
    'Regular Stage',
    'Championship Round',
    'Championship Playoff',    # or Championship play-off
    'Relegation Round',
    'Relegation Playoff',
    'Play-offs',
    'Playoff Stage',
    'Grunddurchgang',
    'Finaldurchgang - Qualifikationsgruppe',
    'Finaldurchgang - Qualifikation',
    'Finaldurchgang - Meistergruppe',
    'Finaldurchgang - Meister',
    'EL Play-off',
    'Europa League Play-off',
    'Europa-League-Play-offs',
    'Europa League Finals',
    'Playoffs - Championship',
    'Playoffs - Europa League',
    'Playoffs - Europa League - Finals',
    'Playoffs - Relegation',
    'Playoffs - Challenger',
    'Finals',
    'Match 6th Place',     # e.g. Super League Greece 2012/13

    'Apertura',
    'Apertura - Liguilla',
    'Clausura',
    'Clausura - Liguilla',

  ].map {|name| name.downcase.gsub( /[^a-z]/, '' ) }.freeze


  ## check if the stage name is listed in KNOWN_STAGES;
  ##   returns nil if known; otherwise prints an error and exits (exit 1)
  def check_stage( name )
    # note: normalize names e.g. downcase and remove all non a-z chars (e.g. space, dash, etc.)
    return if KNOWN_STAGES.include?( name.downcase.gsub( /[^a-z]/, '' ) )   ## everything ok

    puts "** !!! ERROR - no (league) stage match found for >#{name}<, add to (builtin) stages table; sorry"
    exit 1
  end

end # class LeagueOutlineReader

end # module SportDb
@@ -1,152 +0,0 @@
1
-
2
module SportDb
module Import

##
## Reads league definitions from an outline text:
##   a heading 1 selects the context (country, int'l clubs or national teams)
##   for the league lines that follow; every "regular" line starts with the
##   league key followed by the league name; a line starting with a pipe (|)
##   adds alternative names to the league read last.
##
## note: all error paths print a message and terminate via exit 1
class LeagueReader

  def world() Import.world; end    ## shortcut convenience helper


  ## read the (utf-8 encoded) file at path and parse it; returns array of League records
  def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
    txt = File.open( path, 'r:utf-8' ) { |f| f.read }
    parse( txt )
  end

  ## convenience shortcut for new( txt ).parse
  def self.parse( txt )
    new( txt ).parse
  end


  include Logging

  def initialize( txt )
    @txt = txt
  end

  ## parse the text passed to the constructor;
  ##   returns an array of all League records in order of appearance
  def parse
    recs     = []
    last_rec = nil    # last league record (target for alternative names)

    country  = nil    # last country
    intl     = false  # is international (league/tournament/cup/competition)
    clubs    = true   # or clubs|national teams

    OutlineReader.parse( @txt ).each do |node|
      if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
        heading_level = node[0][1].to_i     ## e.g. :h1 => 1
        heading       = node[1]

        logger.debug "heading #{heading_level} >#{heading}<"

        if heading_level != 1
          puts "** !!! ERROR !!! unsupported heading level; expected heading 1 for now only; sorry"
          pp node     ## note: print the node (no line variable in scope here)
          exit 1
        end

        ## map to country or international / int'l or national teams
        if heading =~ /national team/i            ## national team tournament
          country = nil
          intl    = true
          clubs   = false
        elsif heading =~ /international|int'l/i   ## int'l club tournament
          country = nil
          intl    = true
          clubs   = true
        else
          ## assume country in heading; allow all "formats" supported by parse e.g.
          ##   Österreich • Austria (at)
          ##   Österreich • Austria
          ##   Austria
          ##   Deutschland (de) • Germany
          country = world.countries.parse( heading )
          intl    = false
          clubs   = true

          ## check country code - MUST exist for now!!!!
          if country.nil?
            puts "!!! error [league reader] - unknown country >#{heading}< - sorry - add country to config to fix"
            exit 1
          end
        end
      elsif node[0] == :p   ## paragraph with (text) lines
        node[1].each do |line|
          if line.start_with?( '|' )
            ## assume continuation with line of alternative names
            ##  note: skip leading pipe
            values = line[1..-1].split( '|' )                  # alt names - allow/use pipe(|)
            values = values.map { |value| _norm( value ) }     ## squish/strip etc.

            logger.debug "alt_names: #{values.join( '|' )}"

            ## note: alternative names require a league line first
            if last_rec.nil?
              puts "** !!! ERROR !!! [league reader] - alternative names without (canonical) league name line:"
              pp line
              exit 1
            end

            last_rec.alt_names += values
          elsif (m = /^([a-z0-9][a-z0-9.]*)[ ]+(.+)$/.match( line ))
            ## assume "regular" line
            ##  check if starts with id (todo/check: use a more "strict"/better regex capture pattern!!!)
            league_key  = m[1]
            ## 1) strip (commercial) sponsor markers/tags e.g $$
            ## 2) strip and squish (white)spaces
            league_name = _norm( m[2] )

            logger.debug "key: >#{league_key}<, name: >#{league_name}<"

            ## prepend country key/code if country present
            ##  todo/fix: only auto-prepend country if key/code start with a number (level) or incl. cup
            ##    why? lets you "overwrite" key if desired - use it - why? why not?
            league_key = "#{country.key}.#{league_key}"   if country

            rec = League.new( key:     league_key,
                              name:    league_name,
                              country: country,
                              intl:    intl,
                              clubs:   clubs )
            recs << rec
            last_rec = rec
          else
            puts "** !!! ERROR !!! missing key for (canonical) league name"
            pp line
            exit 1
          end
        end # each line
      else
        puts "** !!! ERROR !!! [league reader] - unknown line type:"
        pp node
        exit 1
      end
    end

    recs
  end # method parse



  #######################
  ### helpers

  ## norm(alize) helper - squish (spaces)
  ##   and remove dollars ($$$)
  ##   and remove leading and trailing spaces
  def _norm( str )
    ## only extra clean-up of dollars for now ($$$)
    _squish( str.gsub( '$', '' ) )
  end

  ## collapse runs of spaces, tabs and non-breaking spaces (u00a0)
  ##   into a single space and strip both ends
  def _squish( str )
    str.gsub( /[ \t\u00a0]+/, ' ' ).strip
  end


end # class LeagueReader

end ## module Import
end ## module SportDb
@@ -1,132 +0,0 @@
1
module SportDb

##
## Parses a list of teams - either plain team names (one per line)
##   or standings table rows - into a hash keyed by team name.
class ConfParser

  ## convenience shortcut for new( lines ).parse
  def self.parse( lines )
    parser = new( lines )
    parser.parse
  end

  include Logging         ## e.g. logger#debug, logger#info, etc.

  def _read_lines( txt )   ## todo/check: add alias preproc_lines or build_lines or prep_lines etc. - why? why not?
    ## returns an array of lines with comments and empty lines stripped / removed
    lines = []
    txt.each_line do |line|    ## preprocess
      line = line.strip

      next if line.empty? || line.start_with?('#')   ###  skip empty lines and comments
      line = line.sub( /#.*/, '' ).strip             ###  cut-off end-of line comments too
      lines << line
    end
    lines
  end


  ##  lines - a string (split into lines; comments and empty lines removed)
  ##            or an array of lines (used as is)
  def initialize( lines )
    # for convenience split string into lines
    ##    note: removes/strips empty lines
    ## todo/check: change to text instead of array of lines - why? why not?
    @lines = lines.is_a?( String ) ? _read_lines( lines ) : lines
  end



  ## (optional) country code following a separator e.g. › AT or , ENG
  COUNTRY_RE = %r{ [<>‹›,]
                   [ ]*
                  (?<country>[A-Z]{2,4})   ## todo/check: allow one-letter (motor vehicle plates) or 5 letter possible?
                     \b}xi


  ## standings table row regex matcher e.g.
  ##    1  Manchester City  38  32  4  2  106-27  100
  ## or 1. Manchester City  38  32  4  2  106:27  100
  TABLE_RE = %r{ ^
                 (?:
                   (?<rank>\d+)\.?
                   |
                   [-]
                 )
                 [ ]+
                 (?<team>.+?)    ## note: let's use non-greedy (MINIMUM length) match for now
                 [ ]+
                 (?<pld>\d+)     ## (pl)aye(d)
                 [ ]+
                 (?<w>\d+)       ## (w)ins
                 [ ]+
                 (?<d>\d+)       ## (d)raws
                 [ ]+
                 (?<l>\d+)       ## (l)ost
                 [ ]+
                 (?<gf>\d+)      ## (g)oal (f)or
                 [ ]*
                 [:-]            ## note: allow 10-10 or 10:10 or 10 - 10 or 10 : 10 etc.
                 [ ]*
                 (?<ga>\d+)      ## (g)oal (a)gainst
                 (?:             ## allow optional (g)oal (d)ifference
                   [ ]+
                   (?<gd>[±+-]?\d+)   ## (g)oal (d)ifference
                 )?
                 [ ]+
                 (?<pts>\d+)     ## (p)oin(ts)
                 (?:             ## allow optional deductions e.g. [-7]
                   [ ]+
                   \[(?<deduction>-\d+)\]
                 )?
               $}x

  ## Convert the lines into teams.
  ##
  ##  returns a hash keyed by team name; every value is a hash with the
  ##    (optional) keys :country, :rank and :standing
  ##    (:standing holds :pld, :w, :d, :l, :gf, :ga, :pts plus optional :gd and :deduction)
  def parse
    teams = {}    ## convert lines to teams

    @lines.each do |line|
      next if line =~ /^[ -]+$/   ## skip decorative lines with dash only (e.g. ---- or - - - -) etc.

      ## quick hack - check for/extract (optional) country code (for teams) first
      ##  allow as separators <>‹›,  NOTE: includes (,) comma for now too
      country = nil
      if m=COUNTRY_RE.match( line )
        country = m[:country]
        line = line.sub( m[0], '' )  ## replace match with nothing for now
      end

      if m=TABLE_RE.match( line )
        puts " matching table entry >#{line}<"

        name = m[:team]
        ## note: always convert with explicit base 10 -
        ##   Integer() with a single argument honours radix prefixes
        ##   e.g. "08" raises an error and "010" turns into 8 (octal)
        rank = m[:rank] ? Integer( m[:rank], 10 ) : nil

        standing = {
          pld: Integer( m[:pld], 10 ),
          w:   Integer( m[:w], 10 ),
          d:   Integer( m[:d], 10 ),
          l:   Integer( m[:l], 10 ),
          gf:  Integer( m[:gf], 10 ),
          ga:  Integer( m[:ga], 10 ),
        }
        standing[ :gd ]        = Integer( m[:gd].gsub(/[±+]/,''), 10 )   if m[:gd]
        standing[ :pts ]       = Integer( m[:pts], 10 )
        standing[ :deduction ] = Integer( m[:deduction], 10 )            if m[:deduction]

        ## todo/fix: track double usage - why? why not? report/raise error/exception on duplicates?
        team = teams[ name ] ||= {}
        team[ :country ]  = country   if country

        team[ :rank ]     = rank      if rank
        team[ :standing ] = standing
      else
        ## assume team is full line
        name = line.strip     # note: strip leading and trailing spaces
        next if name.empty?   ## note: skip blank lines - do NOT add a team without name

        team = teams[ name ] ||= {}
        team[ :country ] = country   if country
      end
    end

    teams
  end # method parse

end # class ConfParser
end # module SportDb