sportdb-formats 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,174 @@
1
+ # encoding: utf-8
2
+
3
+ module SportDb
4
+ module Import
5
+
6
+ class LeagueIndex
7
+
8
## Build a league index from all league datafiles found in a package.
##
## path - handed as-is to Package.new (directory or zip archive)
##
## Returns a new index with all parsed league records added.
def self.build( path )
  pack = Package.new( path )   ## lets us use directory or zip archive

  recs = []
  pack.each_leagues do |entry|
    ## append in place - no new array per datafile
    recs.concat( League.parse( entry.read ) )
  end

  leagues = new
  leagues.add( recs )
  leagues
end
21
+
22
+
23
## Shortcut to the shared import catalog (Import.catalog).
def catalog
  Import.catalog
end
24
+
25
## Set up an empty index - no leagues, no name mappings, no errors.
def initialize
  @errors          = []   ## collected warning/conflict messages
  @leagues_by_name = {}   ## name lookup table
  @leagues         = []   ## league records
end
30
+
31
+ attr_reader :errors
32
## Returns true if at least one error/warning message got recorded.
def errors?
  !@errors.empty?
end
33
+
34
## Returns the name lookup table (hash).
## todo/check: rename to index or something - why? why not?
def mappings
  @leagues_by_name
end
35
## Returns all league records added so far.
## note: @leagues is a plain array (see initialize and add) -
##   Array has no #values method, so the former @leagues.values raised NoMethodError
def leagues
  @leagues
end
36
+ alias_method :all, :leagues ## use ActiveRecord-like alias for leagues
37
+
38
+
39
+ ## helpers from club - use a helper module for includes - why? why not?
40
+ include NameHelper
41
+ ## incl. strip_lang( name )
42
+ ## normalize( name )
43
+
44
+
45
## Add one or more league records to the index.
##
## rec_or_recs - single league record or array of league records; the code reads
##               name, alt_names, alt_names_auto and country from every record
##
## Every name (canonical + alt names + auto alt names) gets passed through
## strip_lang and normalize and is registered in the name lookup table.
## Name conflicts get printed and collected in errors;
## duplicates inside [name] + alt_names are fatal (exit 1).
##
## Returns the array of records processed.
def add( rec_or_recs )
  recs = rec_or_recs.is_a?( Array ) ? rec_or_recs : [rec_or_recs]   ## wrap (single) rec in array

  ## country key for messages - '?' if record has no country
  country_key = ->( league ) { league.country ? league.country.key : '?' }

  recs.each do |rec|
    ### step 1) add canonical name
    @leagues << rec

    ## step 2) add all names (canonical name + alt names + alt names (auto))
    names = [rec.name] + rec.alt_names
    ## check for duplicates - simple check for now - fix/improve
    ## todo/fix: (auto)remove duplicates - why? why not?
    dups = names.size - names.uniq.size
    if dups != 0
      puts "** !!! ERROR !!! - #{dups} duplicate name(s):"
      pp names
      pp rec
      exit 1
    end

    ## todo/fix: move alt_names_auto up for check unique names
    ##    e.g. remove/avoid auto-generated duplicates ENG 1, AUT 1, etc
    names += rec.alt_names_auto

    names.each do |raw_name|
      ## check lang codes e.g. [en], [fr], etc.
      name     = strip_lang( raw_name )
      norm     = normalize( name )
      alt_recs = @leagues_by_name[ norm ]

      if alt_recs.nil?
        @leagues_by_name[ norm ] = [rec]    ## first record for this name
      elsif alt_recs.include?( rec )
        ## note: do NOT include duplicate league record
        msg = "** !!! WARN !!! - (norm) name conflict/duplicate for league - >#{name}< normalized to >#{norm}< already included >#{rec.name}, #{country_key.call( rec )}<"
        puts msg
        @errors << msg
      else
        ## note: record gets appended (the first record stays in place)
        msg = "** !!! WARN !!! - name conflict/duplicate - >#{name}< will overwrite >#{alt_recs[0].name}, #{country_key.call( alt_recs[0] )}< with >#{rec.name}, #{country_key.call( rec )}<"
        puts msg
        @errors << msg
        alt_recs << rec
      end
    end
  end
end # method add
96
+
97
+
98
## Look up all league records registered under the (normalized) name.
##
## Returns an array of league records or nil if nothing is registered.
##  todo/check: return empty array if no match!!! and NOT nil (add || []) - why? why not?
def match( name )
  @leagues_by_name[ normalize( name ) ]
end
103
+
104
+
105
## Match leagues by name and narrow the result down by country.
##
## name    - league name; passed on to match
## country - Country struct or a value passed on to catalog.countries.find
##
## Returns an array of league records or nil if nothing matches.
## Exits (1) if the country lookup finds no record.
def match_by( name:, country: )
  ## note: match must for now always include name
  candidates = match( name )
  return candidates unless candidates    ## no name match - nothing to filter

  ## note: allow passing in of country struct too - no need to run lookup again
  country_rec = country
  unless country.is_a?( Country )
    ## note: use own "global" countries index setting - why? why not?
    country_rec = catalog.countries.find( country )
    if country_rec.nil?
      puts "** !!! ERROR !!! - unknown country >#{country}< - no match found, sorry - add to world/countries.txt in config"
      exit 1
    end
  end

  ## note: also skips leagues without a country (e.g. int'l leagues & cups) for now
  hits = candidates.select do |league|
    league.country && league.country.key == country_rec.key
  end
  hits.empty? ? nil : hits    ## note: reset to nil if no more matches
end
131
+
132
+
133
## Same as find but prints an error and exits (1) if no league is found.
def find!( name )
  found = find( name )
  return found unless found.nil?

  puts "** !!! ERROR - no league match found for >#{name}<, add to leagues table; sorry"
  exit 1
end
141
+
142
## Find the single league registered under name.
##
## Returns the league record or nil if there is no match.
## Exits (1) if more than one league matches the name.
def find( name )
  hits = match( name )
  return nil if hits.nil?

  if hits.size > 1
    puts "** !!! ERROR - ambigious league name; too many leagues (#{hits.size}) found:"
    pp hits
    exit 1
  end

  hits[0]
end
159
+
160
+
161
+
162
+
163
## debug helper - print all names with more than one league record registered
def dump_duplicates
  @leagues_by_name.each do |name, recs|
    next unless recs.size > 1

    puts "#{recs.size} matching leagues duplicates for >#{name}<:"
    pp recs
  end
end
171
+ end # class LeagueIndex
172
+
173
+ end # module Import
174
+ end # module SportDb
@@ -0,0 +1,141 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ module SportDb
5
+
6
+ ## shared "higher-level" outline reader
7
+ ## todo: add CountryOutlineReader - why? why not?
8
+
9
+ class LeagueOutlineReader ## todo/check - rename to LeaguePageReader / LeaguePageOutlineReader - why? why not?
10
+
11
## Read the file at path (as utf-8 text) and parse it; season gets passed on to parse.
##   use - rename to read_file or from_file etc. - why? why not?
def self.read( path, season: nil )
  txt = File.open( path, 'r:utf-8', &:read )
  parse( txt, season: season )
end
15
+
16
## Convenience helper - build a reader for txt and parse it in one go.
def self.parse( txt, season: nil )
  reader = new( txt )
  reader.parse( season: season )
end
19
+
20
+
21
## txt - the text to parse; kept as-is until parse gets called
def initialize( txt )
  @txt = txt
end
24
+
25
## Parse the text into sections - one section per heading 1.
##
## season - optional season filter (passed on to norm_seasons);
##          sections with a season key not in the filter get skipped
##
## Returns an array of hashes with the keys :league, :season, :stage and :lines;
##   :season holds an Import::Season, :league the result of catalog.leagues.find!,
##   :stage the second heading value (nil if not present).
## Exits (1) on a heading that cannot be split into league and season
##   and on any node type other than :h1 or :p.
def parse( season: nil )
  secs = []     # sec(tion)s

  ## pass 1 - collect sections (heading + lines)
  OutlineReader.parse( @txt ).each do |node|
    case node[0]
    when :h1
      ## check for league (and stage) and season
      heading = node[1]
      values  = split_league( heading )
      ## note: values is empty for a blank heading - use to_s
      ##   to end up in the error branch below (and not in a NoMethodError on nil)
      m = values[0].to_s.match( LEAGUE_SEASON_HEADING_RE )
      if m
        puts "league >#{m[:league]}<, season >#{m[:season]}<"

        secs << { league: m[:league],
                  season: m[:season],
                  stage:  values[1],     ## note: defaults to nil if not present
                  lines:  []
                }
      else
        puts "** !!! ERROR - cannot match league and season in heading; season missing?"
        pp heading
        exit 1
      end
    when :p    ## paragraph with (text) lines
      lines = node[1]
      ## note: skip lines if no heading seen
      if secs.empty?
        puts "** !!! WARN - skipping lines (no heading):"
        pp lines
      else
        ## todo/check: unroll paragraphs into lines or pass along paragraphs - why? why not?
        secs[-1][:lines] += lines
      end
    else
      puts "** !!! ERROR - unknown line type; for now only heading 1 for leagues supported; sorry:"
      pp node
      exit 1
    end
  end

  ## pass 2 - filter seasons if filter present
  if season
    filter = norm_seasons( season )
    secs = secs.select do |sec|
      keep = filter.include?( Import::Season.new( sec[:season] ).key )
      puts "  skipping season >#{sec[:season]}< NOT matched by filter" unless keep
      keep
    end
  end

  ## pass 3 - check & map; replace inline (string with data struct record)
  secs.each do |sec|
    sec[:season] = Import::Season.new( sec[:season] )
    sec[:league] = catalog.leagues.find!( sec[:league] )

    check_stage( sec[:stage] ) if sec[:stage]   ## note: only check for now (no remapping etc.)
  end

  secs
end # method parse
87
+
88
+
89
+
90
## shortcut convenience helper - same as Import.catalog
def catalog
  Import.catalog
end
91
+
92
+ ## split into league + season
93
+ ## e.g. Österr. Bundesliga 2015/16 ## or 2015-16
94
+ ## World Cup 2018
95
## heading pattern with the named captures league and season
LEAGUE_SEASON_HEADING_RE = %r{^
       (?<league>.+?)     ## non-greedy
          \s+
       (?<season>\d{4}
           (?:[\/-]\d{1,4})?     ## optional 2nd year in season
       )
          $}x
102
+
103
## Turn a season filter into a list of season keys.
##
## season_or_seasons - single season (String) or anything else responding to map
##
## Returns the keys as reported by Import::Season.new( season ).key
def norm_seasons( season_or_seasons )
  ## wrap single season in array; otherwise assume it's an array already
  list = season_or_seasons.is_a?( String ) ? [season_or_seasons] : season_or_seasons

  list.map { |item| Import::Season.new( item ).key }
end
112
+
113
+
114
## Split a heading into league / stage / ... e.g.
##   => Österr. Bundesliga 2018/19, Regular Season
##   => Österr. Bundesliga 2018/19, Championship Round
##
## Returns an array of values with leading and trailing whitespace stripped.
##   todo/check: rename to parse_league(s) - why? why not?
def split_league( str )
  ## note: allow , > < or › ‹ as separators for now
  str.split( /[,<>‹›]/ ).map( &:strip )
end
123
+
124
## Check the stage name against the list of known stages (ignoring case).
##
## Returns nil if known; prints an error and exits (1) otherwise.
def check_stage( name )
  known_stages = ['regular season',
                  'championship round',
                  'relegation round',
                  'play-offs'
                 ]

  return if known_stages.include?( name.downcase )    ## everything ok

  puts "** !!! ERROR - no (league) stage match found for >#{name}<, add to (builtin) stages table; sorry"
  exit 1
end
138
+
139
+ end # class LeagueOutlineReader
140
+
141
+ end # module SportDb
@@ -0,0 +1,162 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ module SportDb
5
+ module Import
6
+
7
+
8
+ class LeagueReader
9
+
10
## Shortcut - returns Import.catalog.
def catalog
  Import.catalog
end
11
+
12
+
13
## Read the file at path (as utf-8 text) and parse it.
##   use - rename to read_file or from_file etc. - why? why not?
def self.read( path )
  txt = File.open( path, 'r:utf-8', &:read )
  parse( txt )
end
17
+
18
## Convenience helper - build a reader for txt and parse it in one go.
def self.parse( txt )
  reader = new( txt )
  reader.parse
end
21
+
22
+
23
+
24
+ include Logging
25
+
26
## txt - the text to parse; kept as-is until parse gets called
def initialize( txt )
  @txt = txt
end
29
+
30
## Parse the text into league records.
##
## Heading 1 sets the context for the lines that follow: a heading matching
## "national team" or "international"/"int'l" marks international competitions
## (no country); any other heading gets passed to catalog.countries.parse.
## A regular line holds the league key followed by the league name;
## a line starting with a pipe (|) adds alternative names to the last league.
##
## Returns an array of League records.
## Exits (1) on heading levels other than 1, unknown countries, lines without
##   a leading key, alternative names without a league and unknown node types.
def parse
  recs     = []
  last_rec = nil

  country = nil     # last country
  intl    = false   # is international (league/tournament/cup/competition)
  clubs   = true    # or clubs|national teams

  OutlineReader.parse( @txt ).each do |node|
    if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
      heading_level = node[0][1].to_i
      heading       = node[1]

      logger.debug "heading #{heading_level} >#{heading}<"

      if heading_level != 1
        puts "** !!! ERROR !!! unsupported heading level; expected heading 1 for now only; sorry"
        pp node    ## note: was pp line - no such local here (NameError before exit)
        exit 1
      end

      ## map to country or international / int'l or national teams
      if heading =~ /national team/i             ## national team tournament
        country = nil
        intl    = true
        clubs   = false
      elsif heading =~ /international|int'l/i    ## int'l club tournament
        country = nil
        intl    = true
        clubs   = true
      else
        ## assume country in heading; allow all "formats" supported by parse e.g.
        ##   Österreich • Austria (at)
        ##   Österreich • Austria
        ##   Austria
        ##   Deutschland (de) • Germany
        country = catalog.countries.parse( heading )
        intl    = false
        clubs   = true

        ## check country code - MUST exist for now!!!!
        if country.nil?
          puts "!!! error [league reader] - unknown country >#{heading}< - sorry - add country to config to fix"
          exit 1
        end
      end
    elsif node[0] == :p   ## paragraph with (text) lines
      node[1].each do |line|
        if line.start_with?( '|' )
          ## assume continuation with line of alternative names
          ##  note: skip leading pipe
          ## 1) strip (commercial) sponsor markers/tags e.g. $$ Liga $$BBV$$ MX
          ## 2) strip and squish (white)spaces
          values = line[1..-1].split( '|' ).map do |value|
            value.gsub( '$', '' ).gsub( /[ \t]+/, ' ' ).strip
          end
          logger.debug "alt_names: #{values.join( '|' )}"

          ## note: alternative names need a league line first - report and exit
          ##   (and do not crash on nil)
          if last_rec.nil?
            puts "** !!! ERROR !!! [league reader] - alternative names without (preceding) league:"
            pp line
            exit 1
          end

          last_rec.alt_names += values
        elsif line =~ /^([a-z0-9][a-z0-9.]*)[ ]+(.+)$/
          ## assume "regular" line starting with key
          ##  (todo/check: use a more "strict"/better regex capture pattern!!!)
          league_key  = $1
          ## 1) strip (commercial) sponsor markers/tags e.g $$
          ## 2) strip and squish (white)spaces
          league_name = $2.gsub( '$', '' ).gsub( /[ \t]+/, ' ' ).strip

          logger.debug "key: >#{league_key}<, name: >#{league_name}<"

          level = league_key.upcase.gsub( '.', ' ' )     ## e.g. 1, 2, CUP, etc.

          alt_names_auto = []
          if country
            code = country.key.upcase
            alt_names_auto << "#{code} #{level}"
            ## todo/check: add "hack" for cl (chile) and exclude?
            ##   cl - a) Chile  b) Champions League
            alt_names_auto << "#{code}"   if league_key == '1'   ## add shortcut for top level 1 (just country key)
            if code != country.fifa
              alt_names_auto << "#{country.fifa} #{level}"
              alt_names_auto << "#{country.fifa}"  if league_key == '1'   ## add shortcut for top level 1 (just fifa code)
            end
            alt_names_auto << "#{country.name} #{league_key}"  if league_key =~ /^[0-9]+$/   ## if all numeric e.g. add Austria 1 etc.
          else   ## assume int'l (no country) e.g. champions league, etc.
            ## only auto-add key (e.g. CL, EL, etc.) - note: no country code (prefix/leading) used
            alt_names_auto << level
          end

          ## note: debug trace via logger like all others (was a bare pp dump)
          logger.debug "alt_names_auto: #{alt_names_auto.join( '|' )}"

          ## prepend country key/code if country present
          ## todo/fix: only auto-prepend country if key/code start with a number (level) or incl. cup
          ##    why? lets you "overwrite" key if desired - use it - why? why not?
          league_key = "#{country.key}.#{league_key}"   if country

          rec = League.new( key:            league_key,
                            name:           league_name,
                            alt_names_auto: alt_names_auto,
                            country:        country,
                            intl:           intl,
                            clubs:          clubs )
          recs << rec
          last_rec = rec
        else
          puts "** !!! ERROR !!! missing key for (canonical) league name"
          exit 1
        end
      end # each line
    else
      puts "** !!! ERROR !!! [league reader] - unknown line type:"
      pp node
      exit 1
    end
  end

  recs
end # method parse
158
+
159
+ end # class LeagueReader
160
+
161
+ end ## module Import
162
+ end ## module SportDb