sportdb-formats 1.0.0 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,174 @@
1
+ # encoding: utf-8
2
+
3
module SportDb
module Import

##
# In-memory index of league records that can be looked up by name.
#
# Every record gets registered under its canonical name, its alternative
# names and its auto-generated alternative names. Before a name is used
# as a lookup key it is passed through strip_lang and normalize
# (both mixed in from NameHelper).
class LeagueIndex

  ## helpers from club - use a helper module for includes - why? why not?
  ##   incl. strip_lang( name )
  ##         normalize( name )
  include NameHelper

  ##
  # Builds an index from all league entries of the package found at path.
  #
  # Reads every entry yielded by Package#each_leagues, parses it with
  # League.parse and adds all resulting records to a new index.
  #
  # Returns the populated LeagueIndex.
  def self.build( path )
    pack = Package.new( path )   ## lets us use directory or zip archive

    recs = []
    pack.each_leagues do |entry|
      recs.concat( League.parse( entry.read ) )
    end

    leagues = new
    leagues.add( recs )
    leagues
  end


  ## shortcut convenience helper for the "global" catalog
  def catalog() Import.catalog; end

  def initialize
    @leagues         = []    ## all league records in insertion order
    @leagues_by_name = {}    ## normalized name => array of league records
    @errors          = []    ## warning messages collected by add
  end

  ## warning messages (strings) collected while adding records
  attr_reader :errors

  ## Returns true if at least one warning was recorded by add.
  def errors?() !@errors.empty?; end

  ## Returns the lookup hash (normalized name => array of league records).
  ##   todo/check: rename to index or something - why? why not?
  def mappings() @leagues_by_name; end

  ##
  # Returns all league records added so far.
  #
  # note: @leagues is an array (not a hash) - hand out a copy so that
  #   callers cannot change the index by accident
  def leagues() @leagues.dup; end
  alias_method :all, :leagues    ## use ActiveRecord-like alias for leagues


  ##
  # Adds one league record or an array of league records to the index.
  #
  # A record must respond to name, alt_names, alt_names_auto and country.
  #
  # - Terminates the process (exit 1) if the canonical name plus the
  #   alternative names of a record contain a duplicate.
  # - Records (and prints) a warning if a normalized name is already
  #   taken, either by the same record or by another record. In the
  #   latter case the record is appended to the list kept for the name.
  #
  # Returns the array of records that got added.
  def add( rec_or_recs )
    ## note: do NOT use Array() here - a struct-like record would get
    ##   converted into an array of its field values
    recs = rec_or_recs.is_a?( Array ) ? rec_or_recs : [rec_or_recs]

    recs.each do |rec|
      ### step 1) add canonical record
      @leagues << rec

      ## step 2) add all names (canonical name + alt names + alt names (auto))
      names = [rec.name] + rec.alt_names

      ## check for duplicates - simple check for now - fix/improve
      ##   todo/fix: (auto)remove duplicates - why? why not?
      duplicates = names.size - names.uniq.size
      if duplicates != 0
        puts "** !!! ERROR !!! - #{duplicates} duplicate name(s):"
        pp names
        pp rec
        exit 1
      end

      ## todo/fix: move alt_names_auto up for check unique names
      ##   e.g. remove/avoid auto-generated duplicates ENG 1, AUT 1, etc
      names += rec.alt_names_auto

      names.each do |name|
        ## check lang codes e.g. [en], [fr], etc.
        ##  todo/check/fix: move strip_lang up in the chain - check for duplicates
        ##    (e.g. only lang code marker different etc.) - why? why not?
        name = strip_lang( name )
        norm = normalize( name )

        alt_recs = @leagues_by_name[ norm ]
        if alt_recs.nil?
          @leagues_by_name[ norm ] = [rec]
        elsif alt_recs.include?( rec )
          ## note: do NOT include the same league record twice
          msg = "** !!! WARN !!! - (norm) name conflict/duplicate for league - >#{name}< normalized to >#{norm}< already included >#{rec.name}, #{rec.country ? rec.country.key : '?'}<"
          puts msg
          @errors << msg
        else
          msg = "** !!! WARN !!! - name conflict/duplicate - >#{name}< will overwrite >#{alt_recs[0].name}, #{alt_recs[0].country ? alt_recs[0].country.key : '?'}< with >#{rec.name}, #{rec.country ? rec.country.key : '?'}<"
          puts msg
          @errors << msg
          alt_recs << rec
        end
      end
    end
  end # method add


  ##
  # Looks up all league records registered for name (after normalize).
  #
  # Returns an array of records or nil if there is no match.
  #   todo/check: return empty array if no match!!! and NOT nil (add || []) - why? why not?
  def match( name )
    @leagues_by_name[ normalize( name ) ]
  end


  ##
  # Like match but keeps only leagues of the given country.
  #
  # country may be a Country record or a value accepted by
  # catalog.countries.find (the original notes mention the country key
  # or fifa code). Terminates the process (exit 1) if the country
  # cannot be found.
  #
  # note: leagues without a country (e.g. international leagues & cups)
  #   never match.
  #
  # Returns an array of records or nil if nothing (is left to) match.
  def match_by( name:, country: )
    ## note: match must for now always include name
    candidates = match( name )
    return nil unless candidates

    ## note: allow passing in of country struct too
    ##   (re)use country struct - no need to run lookup again
    country_rec = country.is_a?( Country ) ? country : find_country!( country )

    filtered = candidates.select do |league|
      league.country && league.country.key == country_rec.key
    end
    filtered.empty? ? nil : filtered   ## note: reset to nil if no more matches
  end


  ##
  # Like find but terminates the process (exit 1) if no league matches.
  def find!( name )
    league = find( name )
    if league.nil?
      puts "** !!! ERROR - no league match found for >#{name}<, add to leagues table; sorry"
      exit 1
    end
    league
  end

  ##
  # Finds the single league record registered for name.
  #
  # Returns the record or nil if there is no match. Terminates the
  # process (exit 1) if more than one league is registered for name.
  def find( name )
    m = match( name )
    return nil if m.nil?

    if m.size > 1
      puts "** !!! ERROR - ambigious league name; too many leagues (#{m.size}) found:"
      pp m
      exit 1
    end

    m[0]
  end


  ## debug helper - prints all normalized names shared by more than one league
  def dump_duplicates
    @leagues_by_name.each do |name, leagues|
      next unless leagues.size > 1

      puts "#{leagues.size} matching leagues duplicates for >#{name}<:"
      pp leagues
    end
  end


  private

  ## Looks up country via the catalog; terminates the process (exit 1) if unknown.
  ##   note: use own "global" countries index setting - why? why not?
  def find_country!( country )
    rec = catalog.countries.find( country )
    if rec.nil?
      puts "** !!! ERROR !!! - unknown country >#{country}< - no match found, sorry - add to world/countries.txt in config"
      exit 1
    end
    rec
  end
end # class LeagueIndex

end   # module Import
end   # module SportDb
@@ -0,0 +1,141 @@
1
+ # encoding: utf-8
2
+
3
+
4
module SportDb

## shared "higher-level" outline reader
##  todo: add CountryOutlineReader - why? why not?

##
# Splits an outline text into sections, one per heading 1.
#
# A heading must hold a league name followed by a season and may hold a
# stage after a separator, e.g.
#
#   Österr. Bundesliga 2018/19, Regular Season
#
# All paragraph lines following a heading are collected for its section.
class LeagueOutlineReader    ## todo/check - rename to LeaguePageReader / LeaguePageOutlineReader - why? why not?

  ## stage names accepted by check_stage (compared in lower case)
  KNOWN_STAGES = ['regular season',
                  'championship round',
                  'relegation round',
                  'play-offs'
                 ].freeze

  ## split into league + season
  ##  e.g. Österr. Bundesliga 2015/16   ## or 2015-16
  ##       World Cup 2018
  LEAGUE_SEASON_HEADING_RE = %r{^
         (?<league>.+?)     ## non-greedy
            \s+
         (?<season>\d{4}
            (?:[\/-]\d{1,4})?     ## optional 2nd year in season
         )
            $}x


  ## Reads the (utf-8) file at path and parses it; see #parse for the result.
  ##   use - rename to read_file or from_file etc. - why? why not?
  def self.read( path, season: nil )
    txt = File.open( path, 'r:utf-8' ) { |f| f.read }
    parse( txt, season: season )
  end

  ## Parses the outline text txt; see #parse for the result.
  def self.parse( txt, season: nil )
    new( txt ).parse( season: season )
  end


  def initialize( txt )
    @txt = txt
  end

  ##
  # Parses the text into an array of section hashes with the keys
  #
  #   :league - record returned by catalog.leagues.find!
  #   :season - Import::Season built from the season in the heading
  #   :stage  - stage name from the heading or nil if not present
  #   :lines  - all paragraph lines following the heading
  #
  # season (optional) is a season string or an array of season strings;
  # if present only sections with a matching season key are kept.
  #
  # Terminates the process (exit 1) on a heading without league and
  # season, on an unsupported node type and on an unknown stage name.
  def parse( season: nil )
    ## pass 1 - collect sections
    secs = collect_sections

    ## pass 2 - filter seasons if filter present
    secs = filter_by_season( secs, season ) if season

    ## pass 3 - check & map; replace inline (string with data struct record)
    secs.each do |sec|
      sec[:season] = Import::Season.new( sec[:season] )
      sec[:league] = catalog.leagues.find!( sec[:league] )

      check_stage( sec[:stage] ) if sec[:stage]  ## note: only check for now (no remapping etc.)
    end

    secs
  end # method parse


  def catalog() Import.catalog; end    ## shortcut convenience helper


  ##
  # Maps a season string or an array of season strings to an array of
  # season keys (as returned by Import::Season#key).
  def norm_seasons( season_or_seasons )
    seasons = if season_or_seasons.is_a?( String )    ## wrap in array
                [season_or_seasons]
              else  ## assume it's an array already
                season_or_seasons
              end

    seasons.map { |season| Import::Season.new( season ).key }
  end


  ##
  # Splits a heading into league / stage / ... and strips every part e.g.
  #
  #   Österr. Bundesliga 2018/19, Regular Season
  #     => ['Österr. Bundesliga 2018/19', 'Regular Season']
  #
  # note: returns an empty array for an empty or separator-only string
  #   todo/check: rename to parse_league(s) - why? why not?
  def split_league( str )
    str.split( /[,<>‹›]/ )             ## note: allow , > < or › ‹ for now
       .map { |value| value.strip }    ## remove leading and trailing whitespace
  end

  ##
  # Checks that name is a known stage (ignoring case).
  #
  # Returns nil if known; terminates the process (exit 1) otherwise.
  def check_stage( name )
    return if KNOWN_STAGES.include?( name.downcase )

    puts "** !!! ERROR - no (league) stage match found for >#{name}<, add to (builtin) stages table; sorry"
    exit 1
  end


  private

  ## pass 1 - turn the outline nodes into sections (hashes with
  ##   league, season, stage and lines - all still plain strings)
  def collect_sections
    secs = []     # sec(tion)s
    OutlineReader.parse( @txt ).each do |node|
      case node[0]
      when :h1
        ## check for league (and stage) and season
        heading = node[1]
        values  = split_league( heading )

        ## note: values is empty if the heading is empty or holds
        ##   separators only - to_s routes this case to the error branch
        m = LEAGUE_SEASON_HEADING_RE.match( values[0].to_s )
        if m
          puts "league >#{m[:league]}<, season >#{m[:season]}<"

          secs << { league: m[:league],
                    season: m[:season],
                    stage:  values[1],     ## note: defaults to nil if not present
                    lines:  []
                  }
        else
          puts "** !!! ERROR - cannot match league and season in heading; season missing?"
          pp heading
          exit 1
        end
      when :p   ## paragraph with (text) lines
        lines = node[1]
        ## note: skip lines if no heading seen
        if secs.empty?
          puts "** !!! WARN - skipping lines (no heading):"
          pp lines
        else
          ## todo/check: unroll paragraphs into lines or pass along paragraphs - why? why not?
          secs[-1][:lines] += lines
        end
      else
        puts "** !!! ERROR - unknown line type; for now only heading 1 for leagues supported; sorry:"
        pp node
        exit 1
      end
    end
    secs
  end

  ## pass 2 - keep only sections with a season key included in the
  ##   filter; reports every skipped section
  def filter_by_season( secs, season )
    filter = norm_seasons( season )
    secs.select do |sec|
      keep = filter.include?( Import::Season.new( sec[:season] ).key )
      puts "  skipping season >#{sec[:season]}< NOT matched by filter"  unless keep
      keep
    end
  end

end # class LeagueOutlineReader

end # module SportDb
@@ -0,0 +1,162 @@
1
+ # encoding: utf-8
2
+
3
+
4
module SportDb
module Import


##
# Reads league definitions from an outline text.
#
# A heading 1 sets the context for all following lines: national team
# tournaments, international (club) tournaments or the leagues of a
# country. Every regular line holds a league key followed by the
# canonical league name; a line starting with a pipe (|) holds
# alternative names for the league of the preceding regular line.
class LeagueReader

  include Logging

  ## node types treated as headings
  HEADING_TYPES = [:h1, :h2, :h3, :h4, :h5, :h6].freeze

  ## regular line - league key followed by (canonical) league name
  ##   todo/check: use a more "strict"/better regex capture pattern!!!
  KEY_NAME_LINE_RE = /^(?<key>[a-z0-9][a-z0-9.]*)[ ]+(?<name>.+)$/


  def catalog() Import.catalog; end


  ## Reads the (utf-8) file at path and parses it; see #parse for the result.
  ##   use - rename to read_file or from_file etc. - why? why not?
  def self.read( path )
    txt = File.open( path, 'r:utf-8' ) { |f| f.read }
    parse( txt )
  end

  ## Parses the outline text txt; see #parse for the result.
  def self.parse( txt )
    new( txt ).parse
  end


  def initialize( txt )
    @txt = txt
  end

  ##
  # Parses the text and returns an array of League records.
  #
  # Terminates the process (exit 1) on
  # - a heading other than heading 1,
  # - a heading that cannot be mapped to a country,
  # - an alternative names line without a preceding league line,
  # - a regular line without a league key and
  # - an unknown node type.
  def parse
    recs     = []
    last_rec = nil

    country  = nil     # last country
    intl     = false   # is international (league/tournament/cup/competition)
    clubs    = true    # or clubs|national teams

    OutlineReader.parse( @txt ).each do |node|
      if HEADING_TYPES.include?( node[0] )
        heading_level = node[0][1].to_i
        heading       = node[1]

        logger.debug "heading #{heading_level} >#{heading}<"

        if heading_level != 1
          puts "** !!! ERROR !!! unsupported heading level; expected heading 1 for now only; sorry"
          pp node    ## note: there is no line in scope here - report the node
          exit 1
        end

        country, intl, clubs = classify_heading( heading )
      elsif node[0] == :p   ## paragraph with (text) lines
        node[1].each do |line|
          if line.start_with?( '|' )
            ## assume continuation with line of alternative names
            if last_rec.nil?
              puts "** !!! ERROR !!! [league reader] - alternative names line without preceding league line:"
              pp line
              exit 1
            end

            ## note: skip leading pipe
            values = line[1..-1].split( '|' ).map { |value| squish_name( value ) }
            logger.debug "alt_names: #{values.join( '|' )}"

            last_rec.alt_names += values
          else
            ## assume "regular" line
            ## check if starts with id
            m = KEY_NAME_LINE_RE.match( line )
            if m.nil?
              puts "** !!! ERROR !!! missing key for (canonical) league name"
              pp line
              exit 1
            end

            league_key  = m[:key]
            league_name = squish_name( m[:name] )

            logger.debug "key: >#{league_key}<, name: >#{league_name}<"

            alt_names_auto = build_alt_names_auto( country, league_key )
            logger.debug "alt_names_auto: #{alt_names_auto.join( '|' )}"

            ## prepend country key/code if country present
            ##   todo/fix: only auto-prepend country if key/code start with a number (level) or incl. cup
            ##    why? lets you "overwrite" key if desired - use it - why? why not?
            league_key = "#{country.key}.#{league_key}"  if country

            rec = League.new( key:            league_key,
                              name:           league_name,
                              alt_names_auto: alt_names_auto,
                              country:        country,
                              intl:           intl,
                              clubs:          clubs )
            recs << rec
            last_rec = rec
          end
        end # each line
      else
        puts "** !!! ERROR !!! [league reader] - unknown line type:"
        pp node
        exit 1
      end
    end
    recs
  end # method parse


  private

  ##
  # Maps a heading to the context [country, intl, clubs] used for all
  # following league lines.
  #
  # Terminates the process (exit 1) if the heading is neither a national
  # team nor an international heading and no country can be parsed.
  def classify_heading( heading )
    if heading =~ /national team/i             ## national team tournament
      [nil, true, false]
    elsif heading =~ /international|int'l/i    ## int'l club tournament
      [nil, true, true]
    else
      ## assume country in heading; allow all "formats" supported by parse e.g.
      ##   Österreich • Austria (at)
      ##   Österreich • Austria
      ##   Austria
      ##   Deutschland (de) • Germany
      country = catalog.countries.parse( heading )

      ## check country code - MUST exist for now!!!!
      if country.nil?
        puts "!!! error [league reader] - unknown country >#{heading}< - sorry - add country to config to fix"
        exit 1
      end

      [country, false, true]
    end
  end

  ## 1) strip (commercial) sponsor markers/tags e.g. $$ Liga $$BBV$$ MX
  ## 2) strip and squish (white)spaces
  def squish_name( str )
    str.gsub( '$', '' )
       .gsub( /[ \t]+/, ' ' )
       .strip
  end

  ##
  # Builds the auto-generated alternative names for a league key.
  #
  # With a country the names are built from the country key, the fifa
  # code (if different from the upcased key) and - for all numeric
  # league keys - the country name. The plain country key / fifa code
  # is added as a shortcut for the league key 1.
  # Without a country only the upcased league key is used.
  def build_alt_names_auto( country, league_key )
    key_upcase = league_key.upcase.gsub( '.', ' ' )

    ## assume int'l (no country) e.g. champions league, etc.
    ##   only auto-add key (e.g. CL, EL, etc.)
    ##   note: no country code (prefix/leading) used
    return [key_upcase]  unless country

    names = []
    names << "#{country.key.upcase} #{key_upcase}"
    ## todo/check: add "hack" for cl (chile) and exclude?
    ##             add a list of (auto-)excluded country codes with conflicts? why? why not?
    ##                 cl - a) Chile  b) Champions League
    names << "#{country.key.upcase}"   if league_key == '1'   ## add shortcut for top level 1 (just country key)
    if country.key.upcase != country.fifa
      names << "#{country.fifa} #{key_upcase}"
      names << "#{country.fifa}"   if league_key == '1'   ## add shortcut for top level 1 (just fifa code)
    end
    names << "#{country.name} #{league_key}"  if league_key =~ /^[0-9]+$/   ## if all numeric e.g. add Austria 1 etc.
    names
  end

end # class LeagueReader

end ## module Import
end ## module SportDb