sportdb-formats 2.0.1 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,176 +0,0 @@
1
- # encoding: utf-8
2
-
3
-
4
- module SportDb
5
-
6
- ## shared "higher-level" outline reader
7
- ## todo: add CountryOutlineReader - why? why not?
8
-
9
- class LeagueOutlineReader ## todo/check - rename to LeaguePageReader / LeaguePageOutlineReader - why? why not?
10
-
11
## Reads the file at path as utf-8 text and hands the text on to parse;
##   the (optional) season filter gets passed along unchanged.
##  todo/check: rename to read_file or from_file etc. - why? why not?
def self.read( path, season: nil )
  txt = File.open( path, 'r:utf-8', &:read )
  parse( txt, season: season )
end
15
-
16
## Convenience shortcut - builds a reader for the passed-in text
##   and returns the result of its parse (with the season filter passed along).
def self.parse( txt, season: nil )
  reader = new( txt )
  reader.parse( season: season )
end
19
-
20
-
21
## Keeps the (raw) outline text around for a later call to parse.
def initialize( txt )
  @txt = txt
end
24
-
25
## Splits the outline text into sec(tion)s - one per heading 1.
##
##  Every heading 1 must match LEAGUE_SEASON_HEADING_RE (league name + season);
##    an (optional) stage may follow after one of the separators handled by split_league.
##  Paragraph lines get appended to the section of the last heading seen.
##
##  season - optional filter; passed to norm_seasons, sections with a season key
##             NOT included get dropped
##
##  Returns an array of hashes with the keys :league, :season, :stage and :lines
##    where :season holds the result of Season.parse and
##          :league the result of catalog.leagues.find!.
##
##  note: prints an error and exits (with 1) on a heading that cannot be matched,
##          on an unknown node type or (via check_stage) on an unknown stage.
def parse( season: nil )
  sections = []     # sec(tion)s collected so far

  ## pass 1 - collect headings and the lines that follow
  OutlineReader.parse( @txt ).each do |node|
    case node[0]
    when :h1
      ## check for league (and stage) and season
      heading = node[1]
      parts   = split_league( heading )
      m       = parts[0].match( LEAGUE_SEASON_HEADING_RE )
      unless m
        puts "** !!! ERROR - cannot match league and season in heading; season missing?"
        pp heading
        exit 1
      end

      puts "league >#{m[:league]}<, season >#{m[:season]}<"

      sections << { league: m[:league],
                    season: m[:season],
                    stage:  parts[1],    ## note: defaults to nil if not present
                    lines:  [] }
    when :p     ## paragraph with (text) lines
      lines = node[1]
      if sections.empty?
        ## note: skip lines if no heading seen
        puts "** !!! WARN - skipping lines (no heading):"
        pp lines
      else
        ## todo/check: unroll paragraphs into lines or pass along paragraphs - why? why not?
        sections[-1][:lines] += lines
      end
    else
      puts "** !!! ERROR - unknown line type; for now only heading 1 for leagues supported; sorry:"
      pp node
      exit 1
    end
  end

  ## pass 2 - filter seasons if filter present
  if season
    wanted   = norm_seasons( season )
    sections = sections.select do |sec|
      keep = wanted.include?( Season.parse( sec[:season] ).key )
      puts " skipping season >#{sec[:season]}< NOT matched by filter"  unless keep
      keep
    end
  end

  ## pass 3 - check & map; replace inline (string with data struct record)
  sections.each do |sec|
    sec[:season] = Season.parse( sec[:season] )
    sec[:league] = catalog.leagues.find!( sec[:league] )

    ## note: only check for now (no remapping etc.)
    check_stage( sec[:stage] )   if sec[:stage]
  end

  sections
end # method parse
87
-
88
-
89
-
90
## shortcut convenience helper - returns Import.catalog
def catalog
  Import.catalog
end
91
-
92
- ## split into league + season
93
- ## e.g. Österr. Bundesliga 2015/16 ## or 2015-16
94
- ## World Cup 2018
95
## Matches a heading made up of a league name followed by a season
##   (named captures: league, season), e.g.
##     Österr. Bundesliga 2015/16     ## or 2015-16
##     World Cup 2018
LEAGUE_SEASON_HEADING_RE = %r{^
                               (?<league>.+?)     ## league name - non-greedy
                               \s+
                               (?<season>
                                  \d{4}                ## first (or only) year
                                  (?:[\/-]\d{1,4})?    ## optional 2nd year in season
                               )
                             $}x
102
-
103
-
104
## Turns a single season, an array of seasons or a range of seasons
##   (e.g. Season(1999)..Season(2001) or such) into an array of season keys;
##   every entry gets converted with Season( ) and mapped to its key.
##  todo/check: add alias norm_seasons - why? why not?
def norm_seasons( season_or_seasons )
  seasons = case season_or_seasons
            when Array then season_or_seasons          # is an array already
            when Range then season_or_seasons.to_a
            else            [season_or_seasons]        # assume single entry - wrap in array
            end

  seasons.map { |season| Season( season ).key }
end
116
-
117
-
118
## Splits a heading into league / stage / ... parts, e.g.
##    Österr. Bundesliga 2018/19, Regular Season
##    Österr. Bundesliga 2018/19, Championship Round
##  note: allows , > < or › ‹ as separators for now;
##          leading and trailing spaces get removed from every part.
##  todo/check: rename to parse_league(s) - why? why not?
def split_league( str )
  str.split( /[,<>‹›]/ ).map( &:strip )
end
127
-
128
-
129
- # note: normalize names e.g. downcase and remove all non a-z chars (e.g. space, dash, etc.)
130
## Builtin table of known (league) stage names.
##  note: names get stored normalized, that is, downcased and
##          with all non a-z chars removed (e.g. space, dash, digits, etc.);
##        the table is frozen to guard against accidental changes.
KNOWN_STAGES = [
  'Regular Season',
  'Regular Stage',
  'Championship Round',
  'Championship Playoff',     # or Championship play-off
  'Relegation Round',
  'Relegation Playoff',
  'Play-offs',
  'Playoff Stage',
  'Grunddurchgang',
  'Finaldurchgang - Qualifikationsgruppe',
  'Finaldurchgang - Qualifikation',
  'Finaldurchgang - Meistergruppe',
  'Finaldurchgang - Meister',
  'EL Play-off',
  'Europa League Play-off',
  'Europa-League-Play-offs',
  'Europa League Finals',
  'Playoffs - Championship',
  'Playoffs - Europa League',
  'Playoffs - Europa League - Finals',
  'Playoffs - Relegation',
  'Playoffs - Challenger',
  'Finals',
  'Match 6th Place',     # e.g. Super League Greece 2012/13

  'Apertura',
  'Apertura - Liguilla',
  'Clausura',
  'Clausura - Liguilla',
].map { |name| name.downcase.gsub( /[^a-z]/, '' ) }.freeze


## Checks if the stage name is listed in KNOWN_STAGES.
##  note: the name gets normalized the same way as the table entries
##          (downcase and remove all non a-z chars) before the lookup.
##
##  Returns nil if the stage is known;
##    otherwise prints an error and exits (with 1).
def check_stage( name )
  return if KNOWN_STAGES.include?( name.downcase.gsub( /[^a-z]/, '' ) )   ## everything ok

  puts "** !!! ERROR - no (league) stage match found for >#{name}<, add to (builtin) stages table; sorry"
  exit 1
end
173
-
174
- end # class LeagueOutlineReader
175
-
176
- end # module SportDb
@@ -1,152 +0,0 @@
1
-
2
- module SportDb
3
- module Import
4
-
5
-
6
- class LeagueReader
7
-
8
## shortcut convenience helper - returns Import.world
def world
  Import.world
end
9
-
10
-
11
## Reads the file at path as utf-8 text and returns the result of parse for that text.
##  todo/check: rename to read_file or from_file etc. - why? why not?
def self.read( path )
  txt = File.open( path, 'r:utf-8', &:read )
  parse( txt )
end
15
-
16
## Convenience shortcut - builds a reader for the passed-in text
##   and returns the result of its parse.
def self.parse( txt )
  reader = new( txt )
  reader.parse
end
19
-
20
-
21
-
22
- include Logging
23
-
24
## Keeps the (raw) text around for a later call to parse.
def initialize( txt )
  @txt = txt
end
27
-
28
## Converts the outline text into an array of League records.
##
##  Headings (only heading 1 supported for now) set the context for the lines that follow:
##    - heading with "national team"            => int'l tournament for national teams
##    - heading with "international" or "int'l" => int'l tournament for clubs
##    - anything else                           => assumed to be a country (looked up via world.countries.parse)
##
##  Paragraph lines are either
##    - a league line starting with the key followed by the name  or
##    - a continuation line starting with a pipe (|) listing
##        alternative names for the league of the last league line.
##
##  Returns the array of League records (in the order read).
##
##  note: prints an error and exits (with 1) on
##          an unsupported heading level, an unknown country,
##          a line with alternative names but no league line before,
##          a league line without key or an unknown node type.
def parse
  recs     = []
  last_rec = nil

  country = nil     # last country
  intl    = false   # is international (league/tournament/cup/competition)
  clubs   = true    # or clubs|national teams

  OutlineReader.parse( @txt ).each do |node|
    if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
      heading_level = node[0][1].to_i    ## e.g. :h2 => 2
      heading       = node[1]

      logger.debug "heading #{heading_level} >#{heading}<"

      if heading_level != 1
        puts "** !!! ERROR !!! unsupported heading level; expected heading 1 for now only; sorry"
        pp node    ## note: report the node; there is no line in scope here
        exit 1
      else
        logger.debug "heading (#{heading_level}) >#{heading}<"
        ## map to country or international / int'l or national teams
        if heading =~ /national team/i           ## national team tournament
          country = nil
          intl    = true
          clubs   = false
        elsif heading =~ /international|int'l/i   ## int'l club tournament
          country = nil
          intl    = true
          clubs   = true
        else
          ## assume country in heading; allow all "formats" supported by parse e.g.
          ##   Österreich • Austria (at)
          ##   Österreich • Austria
          ##   Austria
          ##   Deutschland (de) • Germany
          country = world.countries.parse( heading )
          intl    = false
          clubs   = true

          ## check country code - MUST exist for now!!!!
          if country.nil?
            puts "!!! error [league reader] - unknown country >#{heading}< - sorry - add country to config to fix"
            exit 1
          end
        end
      end
    elsif node[0] == :p    ## paragraph with (text) lines
      lines = node[1]
      lines.each do |line|

        if line.start_with?( '|' )
          ## assume continuation with line of alternative names
          ## note: requires a league line before to attach the names to
          if last_rec.nil?
            puts "** !!! ERROR !!! [league reader] - alternative names without (canonical) league name before:"
            pp line
            exit 1
          end

          ## note: skip leading pipe
          values = line[1..-1].split( '|' )   # names - allow/use pipe(|)
          values = values.map { |value| _norm( value ) }   ## squish/strip etc.

          logger.debug "alt_names: #{values.join( '|' )}"

          last_rec.alt_names += values
        else
          ## assume "regular" line
          ## check if starts with id (todo/check: use a more "strict"/better regex capture pattern!!!)
          m = line.match( /^([a-z0-9][a-z0-9.]*)[ ]+(.+)$/ )
          if m
            league_key = m[1]
            ## 1) strip (commercial) sponsor markers/tags e.g $$
            ## 2) strip and squish (white)spaces
            league_name = _norm( m[2] )

            logger.debug "key: >#{league_key}<, name: >#{league_name}<"

            ## prepend country key/code if country present
            ##   todo/fix: only auto-prepend country if key/code start with a number (level) or incl. cup
            ##     why? lets you "overwrite" key if desired - use it - why? why not?
            league_key = "#{country.key}.#{league_key}"   if country

            rec = League.new( key:     league_key,
                              name:    league_name,
                              country: country,
                              intl:    intl,
                              clubs:   clubs )
            recs << rec
            last_rec = rec
          else
            puts "** !!! ERROR !!! missing key for (canonical) league name"
            exit 1
          end
        end
      end # each line
    else
      puts "** !!! ERROR !!! [league reader] - unknown line type:"
      pp node
      exit 1
    end
  end
  recs
end # method parse
130
-
131
-
132
-
133
- #######################
134
- ### helpers
135
-
136
- ## norm(alize) helper - squish (spaces)
137
- ## and remove dollars ($$$)
138
- ## and remove leading and trailing spaces
139
## norm(alize) helper - removes all dollars ($$$) and
##   passes the result on to _squish
##   (squish spaces and remove leading and trailing spaces)
def _norm( str )
  ## only extra clean-up of dollars for now ($$$)
  without_dollars = str.gsub( '$', '' )
  _squish( without_dollars )
end
143
-
144
## Replaces every run of spaces, tabs or non-breaking spaces (\u00a0)
##   with a single space and removes leading and trailing whitespace.
def _squish( str )
  squeezed = str.gsub( /[ \t\u00a0]+/, ' ' )
  squeezed.strip
end
147
-
148
-
149
- end # class LeagueReader
150
-
151
- end ## module Import
152
- end ## module SportDb
@@ -1,132 +0,0 @@
1
- module SportDb
2
-
3
- class ConfParser
4
-
5
## Convenience shortcut - builds a parser for the passed-in lines
##   and returns the result of its parse.
def self.parse( lines )
  new( lines ).parse
end
9
-
10
- include Logging ## e.g. logger#debug, logger#info, etc.
11
-
12
## Returns an array of lines with comments and empty lines stripped / removed;
##   every line gets stripped of leading and trailing whitespace
##   and end-of-line comments get cut off too.
##  todo/check: add alias preproc_lines or build_lines or prep_lines etc. - why? why not?
def _read_lines( txt )
  txt.each_line
     .map    { |line| line.strip }
     .reject { |line| line.empty? || line.start_with?( '#' ) }   ### skip empty lines and comments
     .map    { |line| line.sub( /#.*/, '' ).strip }              ### cut-off end-of line comments too
end
24
-
25
-
26
## Accepts the lines as an array or - for convenience - as a single string;
##   a string gets split into lines via _read_lines
##   (note: removes/strips comments and empty lines), anything else is used as is.
##  todo/check: change to text instead of array of lines - why? why not?
def initialize( lines )
  @lines = if lines.is_a?( String )
             _read_lines( lines )
           else
             lines
           end
end
32
-
33
-
34
-
35
## (optional) country code (for teams) following one of the separators <>‹›,
##   note: includes (,) comma for now too; matches case-insensitive
COUNTRY_RE = %r{ [<>‹›,]
                 [ ]*
                 (?<country>[A-Z]{2,4})   ## todo/check: allow one-letter (motor vehicle plates) or 5 letter possible?
                 \b}xi


## standings table row regex matcher e.g.
##    1  Manchester City         38  32  4  2 106-27 100
## or 1. Manchester City         38  32  4  2 106:27 100
TABLE_RE = %r{ ^
                (?:
                    (?<rank>\d+)\.?
                    |
                    [-]
                 )
                [ ]+
                (?<team>.+?)     ## note: let's use non-greedy (MINIMUM length) match for now
                [ ]+
                (?<pld>\d+)      ## (pl)aye(d)
                [ ]+
                (?<w>\d+)        ## (w)ins
                [ ]+
                (?<d>\d+)        ## (d)raws
                [ ]+
                (?<l>\d+)        ## (l)ost
                [ ]+
                (?<gf>\d+)       ## (g)oal (f)or
                [ ]*
                [:-]             ## note: allow 10-10 or 10:10 or 10 - 10 or 10 : 10 etc.
                [ ]*
                (?<ga>\d+)       ## (g)oal (a)gainst
                (?:              ## allow optional (g)oal (d)ifference
                  [ ]+
                  (?<gd>[±+-]?\d+)    ## (g)oal (d)ifference
                )?
                [ ]+
                (?<pts>\d+)      ## (p)oin(ts)
                (?:              ## allow optional deductions e.g. [-7]
                  [ ]+
                  \[(?<deduction>-\d+)\]
                )?
              $}x


## Converts the lines into a hash of teams (keyed by team name).
##
##  A line is either a standings table row (see TABLE_RE) or
##    assumed to be a team name (full line); in both cases an (optional)
##    country code (see COUNTRY_RE) gets cut out of the line first.
##  Decorative lines with dashes only get skipped.
##
##  Returns a hash mapping the team name to a hash with the (optional) keys
##    :country, :rank and :standing (with :pld, :w, :d, :l, :gf, :ga, :pts
##    plus :gd and :deduction if present).
##
##  note: all numbers get converted with base 10 - otherwise
##          Integer() treats a leading zero as an octal prefix
##          (e.g. "08" raises an ArgumentError and "010" results in 8).
def parse
  teams = {}    ## convert lines to teams

  @lines.each do |line|
    next if line =~ /^[ -]+$/    ## skip decorative lines with dash only (e.g. ---- or - - - -) etc.

    ## quick hack - check for/extract (optional) country code (for teams) first
    country = nil
    if (m = COUNTRY_RE.match( line ))
      country = m[:country]
      line    = line.sub( m[0], '' )    ## replace match with nothing for now
    end

    if (m = TABLE_RE.match( line ))
      puts " matching table entry >#{line}<"

      name = m[:team]
      rank = m[:rank] ? Integer( m[:rank], 10 ) : nil

      standing = {
        pld: Integer( m[:pld], 10 ),
        w:   Integer( m[:w],   10 ),
        d:   Integer( m[:d],   10 ),
        l:   Integer( m[:l],   10 ),
        gf:  Integer( m[:gf],  10 ),
        ga:  Integer( m[:ga],  10 ),
      }
      ## note: remove (leading) ± or + sign first; a minus sign gets kept
      standing[ :gd ]        = Integer( m[:gd].gsub( /[±+]/, '' ), 10 )   if m[:gd]
      standing[ :pts ]       = Integer( m[:pts], 10 )
      standing[ :deduction ] = Integer( m[:deduction], 10 )               if m[:deduction]

      ## todo/fix: track double usage - why? why not? report/raise error/exception on duplicates?
      team = teams[ name ] ||= {}
      team[ :country ]  = country   if country
      team[ :rank ]     = rank      if rank
      team[ :standing ] = standing
    else
      ## assume team is full line
      name = line.strip    # note: strip leading and trailing spaces

      team = teams[ name ] ||= {}
      team[ :country ] = country   if country
    end
  end

  teams
end # method parse
130
-
131
- end # class ConfParser
132
- end # module SportDb