sportdb-formats 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Manifest.txt +21 -0
- data/lib/sportdb/formats.rb +63 -0
- data/lib/sportdb/formats/country/country_index.rb +192 -0
- data/lib/sportdb/formats/country/country_reader.rb +122 -0
- data/lib/sportdb/formats/league/league_index.rb +174 -0
- data/lib/sportdb/formats/league/league_outline_reader.rb +141 -0
- data/lib/sportdb/formats/league/league_reader.rb +162 -0
- data/lib/sportdb/formats/team/club_index.rb +336 -0
- data/lib/sportdb/formats/team/club_reader.rb +350 -0
- data/lib/sportdb/formats/team/club_reader_props.rb +75 -0
- data/lib/sportdb/formats/team/national_team_index.rb +114 -0
- data/lib/sportdb/formats/team/team_index.rb +43 -0
- data/lib/sportdb/formats/team/wiki_reader.rb +108 -0
- data/lib/sportdb/formats/version.rb +1 -1
- data/test/helper.rb +72 -0
- data/test/test_club_index.rb +183 -0
- data/test/test_club_reader.rb +201 -0
- data/test/test_club_reader_props.rb +54 -0
- data/test/test_country_index.rb +63 -0
- data/test/test_country_reader.rb +59 -0
- data/test/test_league_index.rb +157 -0
- data/test/test_league_outline_reader.rb +55 -0
- data/test/test_league_reader.rb +72 -0
- data/test/test_regex.rb +49 -0
- data/test/test_wiki_reader.rb +77 -0
- metadata +22 -1
@@ -0,0 +1,174 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module SportDb
|
4
|
+
module Import
|
5
|
+
|
6
|
+
class LeagueIndex
|
7
|
+
|
8
|
+
## Build a league index from all league datafiles found in a package.
##   path - passed on as-is to Package.new
## Returns the new index with all parsed league records added.
def self.build( path )
  pack = Package.new( path )   ## lets us use directory or zip archive

  recs = []
  pack.each_leagues do |entry|
    ## append in place - no need for a new array for every datafile
    recs.concat( League.parse( entry.read ) )
  end

  leagues = new
  leagues.add( recs )
  leagues
end
|
21
|
+
|
22
|
+
|
23
|
+
## convenience shortcut - delegates to Import.catalog
def catalog
  Import.catalog
end
|
24
|
+
|
25
|
+
## Start with an empty index.
def initialize
  @leagues         = []   ## all league records (in the order added)
  @leagues_by_name = {}   ## normalized name => array of league records
  @errors          = []   ## warning/error messages collected by add
end
|
30
|
+
|
31
|
+
attr_reader :errors
|
32
|
+
## true if at least one message got recorded in errors
def errors?
  !@errors.empty?
end
|
33
|
+
|
34
|
+
## returns the (normalized) name => league records lookup hash
## todo/check: rename to index or something - why? why not?
def mappings
  @leagues_by_name
end
|
35
|
+
## returns all league records added so far
## note: @leagues is an array (see initialize) - there is no values method
##   to call as with a hash
def leagues() @leagues; end
|
36
|
+
alias_method :all, :leagues ## use ActiveRecord-like alias for leagues
|
37
|
+
|
38
|
+
|
39
|
+
## helpers from club - use a helper module for includes - why? why not?
|
40
|
+
include NameHelper
|
41
|
+
## incl. strip_lang( name )
|
42
|
+
## normalize( name )
|
43
|
+
|
44
|
+
|
45
|
+
## Add one league record or an array of league records.
##
## Every record gets stored and then indexed by all its names
##  (canonical name + alt names + auto-generated alt names)
##  after stripping lang markers and normalizing.
## Name conflicts get printed and collected in errors.
## Note: exits (with status 1) if name + alt names of a record include duplicates.
def add( rec_or_recs )
  recs = rec_or_recs.is_a?( Array ) ? rec_or_recs : [rec_or_recs]     ## wrap (single) rec in array

  recs.each do |rec|
    ### step 1) add record
    @leagues << rec

    ## step 2) add all names (canonical name + alt names + alt names (auto))
    names = [rec.name] + rec.alt_names
    ## check for duplicates - simple check for now - fix/improve
    ## todo/fix: (auto)remove duplicates - why? why not?
    dups = names.size - names.uniq.size
    if dups != 0
      puts "** !!! ERROR !!! - #{dups} duplicate name(s):"
      pp names
      pp rec
      exit 1
    end

    ## todo/fix: move alt_names_auto up for check unique names
    ##    e.g. remove/avoid auto-generated duplicates ENG 1, AUT 1, etc
    names += rec.alt_names_auto

    names.each do |name|
      ## check lang codes e.g. [en], [fr], etc.
      ##  todo/check/fix: move strip_lang up in the chain - check for duplicates (e.g. only lang code marker different etc.) - why? why not?
      plain = strip_lang( name )
      norm  = normalize( plain )
      known = @leagues_by_name[ norm ]

      if known.nil?
        @leagues_by_name[ norm ] = [rec]
      elsif known.include?( rec )
        ## note: do NOT include the same league record twice
        msg = "** !!! WARN !!! - (norm) name conflict/duplicate for league - >#{plain}< normalized to >#{norm}< already included >#{rec.name}, #{rec.country ? rec.country.key : '?'}<"
        puts msg
        @errors << msg
      else
        ## note: the record gets appended (the name now maps to more than one league)
        msg = "** !!! WARN !!! - name conflict/duplicate - >#{plain}< will overwrite >#{known[0].name}, #{known[0].country ? known[0].country.key : '?'}< with >#{rec.name}, #{rec.country ? rec.country.key : '?'}<"
        puts msg
        @errors << msg
        known << rec
      end
    end
  end
end # method add
|
96
|
+
|
97
|
+
|
98
|
+
## Look up all league records for a name (after normalizing).
## Returns an array of records or nil if no match.
def match( name )
  ## todo/check: return empty array if no match!!! and NOT nil (add || []) - why? why not?
  @leagues_by_name[ normalize( name ) ]
end
|
103
|
+
|
104
|
+
|
105
|
+
## Look up league records by name and filter by country.
##   name:    - league name (note: match must for now always include name)
##   country: - Country struct or a value for the catalog countries lookup
##                (the country key or fifa code for now)
## Returns an array of records or nil if no match (left).
## Note: exits (with status 1) if the country lookup finds no match.
def match_by( name:, country: )
  m = match( name )
  return m  unless m     ## no match for name - nothing to filter

  ## note: allow passing in of country struct too
  ##   (re)use country struct - no need to run lookup again
  country_rec = country.is_a?( Country ) ? country : catalog.countries.find( country )
  if country_rec.nil?
    puts "** !!! ERROR !!! - unknown country >#{country}< - no match found, sorry - add to world/countries.txt in config"
    exit 1
  end

  ## note: also skip international leagues & cups (e.g. champions league etc.) for now - why? why not?
  m = m.select { |league| league.country && league.country.key == country_rec.key }
  m.empty? ? nil : m     ## note: reset to nil if no more matches
end
|
131
|
+
|
132
|
+
|
133
|
+
## Like find but exits (with status 1) after printing an error
##   if no league found.
def find!( name )
  league = find( name )
  if league.nil?
    puts "** !!! ERROR - no league match found for >#{name}<, add to leagues table; sorry"
    exit 1
  end
  league
end
|
141
|
+
|
142
|
+
## Find the one-and-only league record for a name.
## Returns nil if no match.
## Note: exits (with status 1) if the name matches more than one league.
def find( name )
  m = match( name )
  # pp m
  return nil  if m.nil?

  if m.size > 1
    puts "** !!! ERROR - ambigious league name; too many leagues (#{m.size}) found:"
    pp m
    exit 1
  end

  m[0]
end
|
159
|
+
|
160
|
+
|
161
|
+
|
162
|
+
|
163
|
+
## debug helper - report all (normalized) names matching more than one league record
def dump_duplicates
  @leagues_by_name.each do |name, recs|
    next  unless recs.size > 1

    puts "#{recs.size} matching leagues duplicates for >#{name}<:"
    pp recs
  end
end
|
171
|
+
end # class LeagueIndex
|
172
|
+
|
173
|
+
end # module Import
|
174
|
+
end # module SportDb
|
@@ -0,0 +1,141 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
module SportDb
|
5
|
+
|
6
|
+
## shared "higher-level" outline reader
|
7
|
+
## todo: add CountryOutlineReader - why? why not?
|
8
|
+
|
9
|
+
class LeagueOutlineReader ## todo/check - rename to LeaguePageReader / LeaguePageOutlineReader - why? why not?
|
10
|
+
|
11
|
+
## Read the file at path (as utf-8 text) and parse it.
##   season: - optional season filter passed on to parse
## use - rename to read_file or from_file etc. - why? why not?
def self.read( path, season: nil )
  txt = File.read( path, encoding: 'utf-8' )
  parse( txt, season: season )
end
|
15
|
+
|
16
|
+
## Parse the text - shortcut for new( txt ).parse( season: season ).
def self.parse( txt, season: nil )
  new( txt ).parse( season: season )
end
|
19
|
+
|
20
|
+
|
21
|
+
## Keep the text for parse.
def initialize( txt )
  @txt = txt
end
|
24
|
+
|
25
|
+
## Parse the text into sec(tion)s - one for every (league + season) heading 1.
##   season: - optional filter - a season or an array of seasons
##               (as accepted by norm_seasons)
## Returns an array of hashes with the keys
##   :league, :season, :stage (nil if not present) and :lines.
## Note: exits (with status 1) on a heading without league and season
##   and on any node type other than heading 1 or paragraph.
def parse( season: nil )
  secs = []     # sec(tion)s

  ## pass 1 - collect sections
  OutlineReader.parse( @txt ).each do |node|
    case node[0]
    when :h1
      ## check for league (and stage) and season
      heading = node[1]
      values  = split_league( heading )
      ## note: values is empty for a blank heading - treat as no match
      m = values[0] && values[0].match( LEAGUE_SEASON_HEADING_RE )
      if m
        puts "league >#{m[:league]}<, season >#{m[:season]}<"

        secs << { league: m[:league],
                  season: m[:season],
                  stage:  values[1],     ## note: defaults to nil if not present
                  lines:  []
                }
      else
        puts "** !!! ERROR - cannot match league and season in heading; season missing?"
        pp heading
        exit 1
      end
    when :p      ## paragraph with (text) lines
      lines = node[1]
      ## note: skip lines if no heading seen
      if secs.empty?
        puts "** !!! WARN - skipping lines (no heading):"
        pp lines
      else
        ## todo/check: unroll paragraphs into lines or pass along paragraphs - why? why not?
        secs[-1][:lines].concat( lines )
      end
    else
      puts "** !!! ERROR - unknown line type; for now only heading 1 for leagues supported; sorry:"
      pp node
      exit 1
    end
  end

  ## pass 2 - filter seasons if filter present
  if season
    filter = norm_seasons( season )
    secs = secs.select do |sec|
      matched = filter.include?( Import::Season.new( sec[:season] ).key )
      puts " skipping season >#{sec[:season]}< NOT matched by filter"   unless matched
      matched
    end
  end

  ## pass 3 - check & map; replace inline (string with data struct record)
  secs.each do |sec|
    sec[:season] = Import::Season.new( sec[:season] )
    sec[:league] = catalog.leagues.find!( sec[:league] )

    check_stage( sec[:stage] )   if sec[:stage]   ## note: only check for now (no remapping etc.)
  end

  secs
end # method parse
|
87
|
+
|
88
|
+
|
89
|
+
|
90
|
+
## shortcut convenience helper - delegates to Import.catalog
def catalog
  Import.catalog
end
|
91
|
+
|
92
|
+
## split into league + season
|
93
|
+
## e.g. Österr. Bundesliga 2015/16 ## or 2015-16
|
94
|
+
## World Cup 2018
|
95
|
+
## note: uses the named captures league and season
LEAGUE_SEASON_HEADING_RE = %r{^
       (?<league>.+?)     ## non-greedy
          \s+
       (?<season>\d{4}
          (?:[\/-]\d{1,4})?     ## optional 2nd year in season
       )
    $}x
|
102
|
+
|
103
|
+
## Turn a season or an array of seasons into an array of season keys
##   (using Import::Season#key).
def norm_seasons( season_or_seasons )
  seasons = season_or_seasons
  seasons = [seasons]   if seasons.is_a?( String )   ## wrap in array; otherwise assume it's an array already

  seasons.map { |season| Import::Season.new( season ).key }
end
|
112
|
+
|
113
|
+
|
114
|
+
## Split a heading into league (+ season) / stage / ... e.g.
##   Österr. Bundesliga 2018/19, Regular Season
##     => ["Österr. Bundesliga 2018/19", "Regular Season"]
## Returns an array of strings with leading and trailing whitespace removed.
## todo/check: rename to parse_league(s) - why? why not?
def split_league( str )
  ## note: allow , > < or › ‹ for now
  str.split( /[,<>‹›]/ ).map { |value| value.strip }
end
|
123
|
+
|
124
|
+
## Check the stage name against the (builtin) list of known stages
##   (ignoring upper-/lowercase).
## Returns nil if ok; otherwise prints an error and exits (with status 1).
def check_stage( name )
  known_stages = ['regular season',
                  'championship round',
                  'relegation round',
                  'play-offs'
                 ]

  return  if known_stages.include?( name.downcase )   ## everything ok

  puts "** !!! ERROR - no (league) stage match found for >#{name}<, add to (builtin) stages table; sorry"
  exit 1
end
|
138
|
+
|
139
|
+
end # class LeagueOutlineReader
|
140
|
+
|
141
|
+
end # module SportDb
|
@@ -0,0 +1,162 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
module SportDb
|
5
|
+
module Import
|
6
|
+
|
7
|
+
|
8
|
+
class LeagueReader
|
9
|
+
|
10
|
+
## convenience shortcut - delegates to Import.catalog
def catalog
  Import.catalog
end
|
11
|
+
|
12
|
+
|
13
|
+
## Read the file at path (as utf-8 text) and parse it.
## use - rename to read_file or from_file etc. - why? why not?
def self.read( path )
  txt = File.read( path, encoding: 'utf-8' )
  parse( txt )
end
|
17
|
+
|
18
|
+
## Parse the text - shortcut for new( txt ).parse.
def self.parse( txt )
  new( txt ).parse
end
|
21
|
+
|
22
|
+
|
23
|
+
|
24
|
+
include Logging
|
25
|
+
|
26
|
+
## Keep the text for parse.
def initialize( txt )
  @txt = txt
end
|
29
|
+
|
30
|
+
## Parse the text into league records.
##
## Heading 1 sets the context for all leagues that follow:
##   national teams, international (clubs) or a country.
## Paragraph lines are either
##   - a league line starting with a key followed by the (canonical) name or
##   - a line starting with a pipe (|) with more alternative names
##       for the last league.
## Returns an array of records (built with League.new).
## Note: exits (with status 1) on an unsupported heading level, an unknown country,
##   an alternative names line without a league, a league line without a key
##   and on an unknown node type.
def parse
  recs     = []
  last_rec = nil

  country = nil    # last country
  intl    = false  # is international (league/tournament/cup/competition)
  clubs   = true   # or clubs|national teams

  ## 1) strip (commercial) sponsor markers/tags e.g. $$ Liga $$BBV$$ MX
  ## 2) strip and squish (white)spaces
  squish = ->( str ) { str.gsub( '$', '' ).gsub( /[ \t]+/, ' ' ).strip }

  OutlineReader.parse( @txt ).each do |node|
    if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
      heading_level = node[0][1].to_i
      heading       = node[1]

      logger.debug "heading #{heading_level} >#{heading}<"

      if heading_level != 1
        puts "** !!! ERROR !!! unsupported heading level; expected heading 1 for now only; sorry"
        pp node    ## note: there is no line (yet) to report here - print the node
        exit 1
      end

      logger.debug "heading (#{heading_level}) >#{heading}<"
      ## map to country or international / int'l or national teams
      if heading =~ /national team/i   ## national team tournament
        country = nil
        intl    = true
        clubs   = false
      elsif heading =~ /international|int'l/i   ## int'l club tournament
        country = nil
        intl    = true
        clubs   = true
      else
        ## assume country in heading; allow all "formats" supported by parse e.g.
        ##   Österreich • Austria (at)
        ##   Österreich • Austria
        ##   Austria
        ##   Deutschland (de) • Germany
        country = catalog.countries.parse( heading )
        intl    = false
        clubs   = true

        ## check country code - MUST exist for now!!!!
        if country.nil?
          puts "!!! error [league reader] - unknown country >#{heading}< - sorry - add country to config to fix"
          exit 1
        end
      end
    elsif node[0] == :p   ## paragraph with (text) lines
      node[1].each do |line|
        if line.start_with?( '|' )
          ## assume continuation with line of alternative names
          if last_rec.nil?
            puts "** !!! ERROR !!! [league reader] - alternative names line without (preceding) league:"
            pp line
            exit 1
          end

          ## note: skip leading pipe
          values = line[1..-1].split( '|' ).map( &squish )   # names - allow/use pipe(|)
          logger.debug "alt_names: #{values.join( '|' )}"

          last_rec.alt_names += values
        elsif (m = /^([a-z0-9][a-z0-9.]*)[ ]+(.+)$/.match( line ))
          ## assume "regular" line
          ##  check if starts with id  (todo/check: use a more "strict"/better regex capture pattern!!!)
          league_key  = m[1]
          league_name = squish.call( m[2] )

          logger.debug "key: >#{league_key}<, name: >#{league_name}<"

          code = league_key.upcase.gsub( '.', ' ' )
          alt_names_auto = []
          if country
            alt_names_auto << "#{country.key.upcase} #{code}"
            ## todo/check: add "hack" for cl (chile) and exclude?
            ##             add a list of (auto-)excluded country codes with conflicts? why? why not?
            ##                 cl - a) Chile  b) Champions League
            alt_names_auto << country.key.upcase.to_s   if league_key == '1'   ## add shortcut for top level 1 (just country key)
            if country.key.upcase != country.fifa
              alt_names_auto << "#{country.fifa} #{code}"
              alt_names_auto << country.fifa.to_s   if league_key == '1'   ## add shortcut for top level 1 (just fifa code)
            end
            alt_names_auto << "#{country.name} #{league_key}"   if league_key =~ /^[0-9]+$/   ## if all numeric e.g. add Austria 1 etc.
          else  ## assume int'l (no country) e.g. champions league, etc.
            ## only auto-add key (e.g. CL, EL, etc.)
            alt_names_auto << code   ## note: no country code (prefix/leading) used
          end

          pp alt_names_auto

          ## prepend country key/code if country present
          ##   todo/fix: only auto-prepend country if key/code start with a number (level) or incl. cup
          ##    why? lets you "overwrite" key if desired - use it - why? why not?
          league_key = "#{country.key}.#{league_key}"   if country

          rec = League.new( key:            league_key,
                            name:           league_name,
                            alt_names_auto: alt_names_auto,
                            country:        country,
                            intl:           intl,
                            clubs:          clubs )
          recs << rec
          last_rec = rec
        else
          puts "** !!! ERROR !!! missing key for (canonical) league name"
          exit 1
        end
      end # each line
    else
      puts "** !!! ERROR !!! [league reader] - unknown line type:"
      pp node
      exit 1
    end
  end

  recs
end # method parse
|
158
|
+
|
159
|
+
end # class LeagueReader
|
160
|
+
|
161
|
+
end ## module Import
|
162
|
+
end ## module SportDb
|