sportdb-formats 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Manifest.txt +21 -0
- data/lib/sportdb/formats.rb +63 -0
- data/lib/sportdb/formats/country/country_index.rb +192 -0
- data/lib/sportdb/formats/country/country_reader.rb +122 -0
- data/lib/sportdb/formats/league/league_index.rb +174 -0
- data/lib/sportdb/formats/league/league_outline_reader.rb +141 -0
- data/lib/sportdb/formats/league/league_reader.rb +162 -0
- data/lib/sportdb/formats/team/club_index.rb +336 -0
- data/lib/sportdb/formats/team/club_reader.rb +350 -0
- data/lib/sportdb/formats/team/club_reader_props.rb +75 -0
- data/lib/sportdb/formats/team/national_team_index.rb +114 -0
- data/lib/sportdb/formats/team/team_index.rb +43 -0
- data/lib/sportdb/formats/team/wiki_reader.rb +108 -0
- data/lib/sportdb/formats/version.rb +1 -1
- data/test/helper.rb +72 -0
- data/test/test_club_index.rb +183 -0
- data/test/test_club_reader.rb +201 -0
- data/test/test_club_reader_props.rb +54 -0
- data/test/test_country_index.rb +63 -0
- data/test/test_country_reader.rb +59 -0
- data/test/test_league_index.rb +157 -0
- data/test/test_league_outline_reader.rb +55 -0
- data/test/test_league_reader.rb +72 -0
- data/test/test_regex.rb +49 -0
- data/test/test_wiki_reader.rb +77 -0
- metadata +22 -1
@@ -0,0 +1,174 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module SportDb
|
4
|
+
module Import
|
5
|
+
|
6
|
+
class LeagueIndex
|
7
|
+
|
8
|
+
## Build a league index from all league datafiles found in a package.
##
## path - location passed on to Package.new
##
## Returns a new index with every record parsed from the package added.
def self.build( path )
  pack = Package.new( path )   ## lets us use directory or zip archive

  recs = []
  pack.each_leagues do |entry|
    ## append in place - no new array per datafile
    recs.concat( League.parse( entry.read ) )
  end

  leagues = new
  leagues.add( recs )
  leagues
end
|
21
|
+
|
22
|
+
|
23
|
+
## convenience shortcut - delegates to Import.catalog
def catalog
  Import.catalog
end
|
24
|
+
|
25
|
+
## Start with an empty index.
def initialize
  @errors          = []   ## messages collected while adding records
  @leagues_by_name = {}   ## normalized name => array of league records
  @leagues         = []   ## league records in the order added
end
|
30
|
+
|
31
|
+
attr_reader :errors
|
32
|
+
## true if at least one message was recorded in errors
def errors?
  !@errors.empty?
end
|
33
|
+
|
34
|
+
## name lookup table (normalized name => array of league records)
##   todo/check: rename to index or something - why? why not?
def mappings
  @leagues_by_name
end
|
35
|
+
## All league records in the order they were added.
##
## note: @leagues is an array (see initialize / add) and Array has no
##   #values method; hand out a copy so that callers cannot change
##   the index by accident
def leagues
  @leagues.dup
end
|
36
|
+
alias_method :all, :leagues ## use ActiveRecord-like alias for leagues
|
37
|
+
|
38
|
+
|
39
|
+
## helpers from club - use a helper module for includes - why? why not?
|
40
|
+
include NameHelper
|
41
|
+
## incl. strip_lang( name )
|
42
|
+
## normalize( name )
|
43
|
+
|
44
|
+
|
45
|
+
## Add one league record or an array of league records.
##
## Every record gets appended to the list of leagues and registered in the
## name lookup table under its canonical name, its alt names and its
## auto-generated alt names (language marker stripped, then normalized).
##
## Name conflicts are printed and collected in errors. A record that lists
## the same name twice (canonical name + alt names) stops the program with
## exit 1.
##
## Returns the array of records processed.
def add( rec_or_recs )
  recs = rec_or_recs.is_a?( Array ) ? rec_or_recs : [rec_or_recs]   ## wrap (single) rec in array

  recs.each do |rec|
    ### step 1) add league record
    @leagues << rec

    ## step 2) add all names (canonical name + alt names + alt names (auto))
    names = [rec.name] + rec.alt_names

    ## check for duplicates - simple check for now - fix/improve
    ## todo/fix: (auto)remove duplicates - why? why not?
    count      = names.size
    count_uniq = names.uniq.size
    if count != count_uniq
      puts "** !!! ERROR !!! - #{count-count_uniq} duplicate name(s):"
      pp names
      pp rec
      exit 1
    end

    ## todo/fix: move alt_names_auto up for check unique names
    ##    e.g. remove/avoid auto-generated duplicates ENG 1, AUT 1, etc
    names += rec.alt_names_auto

    names.each do |raw_name|
      ## check lang codes e.g. [en], [fr], etc.
      ##  todo/check/fix: move strip_lang up in the chain - check for duplicates (e.g. only lang code marker different etc.) - why? why not?
      name = strip_lang( raw_name )
      norm = normalize( name )

      alt_recs = @leagues_by_name[ norm ]
      if alt_recs
        if alt_recs.include?( rec )
          ## same league record registered under this (normalized) name already - do NOT add twice
          msg = "** !!! WARN !!! - (norm) name conflict/duplicate for league - >#{name}< normalized to >#{norm}< already included >#{rec.name}, #{rec.country ? rec.country.key : '?'}<"
          puts msg
          @errors << msg
        else
          ## another league record uses this name - keep both under the same key
          msg = "** !!! WARN !!! - name conflict/duplicate - >#{name}< will overwrite >#{alt_recs[0].name}, #{alt_recs[0].country ? alt_recs[0].country.key : '?'}< with >#{rec.name}, #{rec.country ? rec.country.key : '?'}<"
          puts msg
          @errors << msg
          alt_recs << rec
        end
      else
        @leagues_by_name[ norm ] = [rec]
      end
    end
  end
end # method add
|
96
|
+
|
97
|
+
|
98
|
+
## Look up all league records registered under the (normalized) name.
##
## Returns an array of records or nil if the name is unknown.
##   todo/check: return empty array if no match!!! and NOT nil (add || []) - why? why not?
def match( name )
  @leagues_by_name[ normalize( name ) ]
end
|
103
|
+
|
104
|
+
|
105
|
+
## Look up league records by name and narrow the result down to a country.
##
## name    - league name (required for now)
## country - Country record or a value accepted by catalog.countries.find
##
## Returns an array of matching records or nil if nothing matches.
## Exits (status 1) if the name matches but the country cannot be found.
def match_by( name:, country: )
  candidates = match( name )
  return candidates unless candidates

  ## note: allow passing in of country struct too - no need to run lookup again
  country_rec = country
  unless country.is_a?( Country )
    country_rec = catalog.countries.find( country )
    if country_rec.nil?
      puts "** !!! ERROR !!! - unknown country >#{country}< - no match found, sorry - add to world/countries.txt in config"
      exit 1
    end
  end

  ## note: leagues without a country (e.g. int'l leagues & cups) never pass the filter
  hits = candidates.select do |league|
    league.country && league.country.key == country_rec.key
  end

  hits.empty? ? nil : hits   ## note: nil if no more matches
end
|
131
|
+
|
132
|
+
|
133
|
+
## Like find but exits (status 1) with an error message if no league matches.
def find!( name )
  league = find( name )
  return league unless league.nil?

  puts "** !!! ERROR - no league match found for >#{name}<, add to leagues table; sorry"
  exit 1
end
|
141
|
+
|
142
|
+
## Find the single league record registered under the name.
##
## Returns the record or nil if there is no match.
## Exits (status 1) if more than one record matches.
def find( name )
  m = match( name )
  return nil if m.nil?

  if m.size > 1
    puts "** !!! ERROR - ambigious league name; too many leagues (#{m.size}) found:"
    pp m
    exit 1
  end

  m[0]
end
|
159
|
+
|
160
|
+
|
161
|
+
|
162
|
+
|
163
|
+
## debug helper - print every (normalized) name shared by more than one league record
def dump_duplicates
  @leagues_by_name.each_pair do |name, leagues|
    next unless leagues.size > 1

    puts "#{leagues.size} matching leagues duplicates for >#{name}<:"
    pp leagues
  end
end
|
171
|
+
end # class LeagueIndex
|
172
|
+
|
173
|
+
end # module Import
|
174
|
+
end # module SportDb
|
@@ -0,0 +1,141 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
module SportDb
|
5
|
+
|
6
|
+
## shared "higher-level" outline reader
|
7
|
+
## todo: add CountryOutlineReader - why? why not?
|
8
|
+
|
9
|
+
class LeagueOutlineReader ## todo/check - rename to LeaguePageReader / LeaguePageOutlineReader - why? why not?
|
10
|
+
|
11
|
+
## Read the (utf-8) text file at path and parse it; see parse for the season filter.
##   todo/check: rename to read_file or from_file etc. - why? why not?
def self.read( path, season: nil )
  parse( File.read( path, encoding: 'utf-8' ), season: season )
end
|
15
|
+
|
16
|
+
## Parse the outline text; shortcut for new( txt ).parse( season: season ).
def self.parse( txt, season: nil )
  reader = new( txt )
  reader.parse( season: season )
end
|
19
|
+
|
20
|
+
|
21
|
+
## txt - outline text; handed to OutlineReader.parse by parse
def initialize( txt )
  @txt = txt
end
|
24
|
+
|
25
|
+
## Split the outline text into league / season sections.
##
## season - optional filter; a season string or an array of season strings;
##          sections of all other seasons get dropped
##
## Returns an array of hashes with the keys
##   :league - record returned by catalog.leagues.find!
##   :season - Import::Season
##   :stage  - second part of the heading (e.g. Regular Season) or nil
##   :lines  - lines of all paragraphs following the heading
##
## Exits (status 1) on a heading without league and season and on any node
## that is not a heading 1 or a paragraph.
def parse( season: nil )
  secs = []     # sec(tion)s

  ## pass 1 - collect sections (heading 1 followed by paragraphs)
  OutlineReader.parse( @txt ).each do |node|
    case node[0]
    when :h1
      ## check for league (and stage) and season
      heading = node[1]
      values  = split_league( heading )
      ## note: values is empty for an empty heading - to_s lets the
      ##   regular error branch below report it
      m = LEAGUE_SEASON_HEADING_RE.match( values[0].to_s )
      if m
        puts "league >#{m[:league]}<, season >#{m[:season]}<"

        secs << { league: m[:league],
                  season: m[:season],
                  stage:  values[1],     ## note: defaults to nil if not present
                  lines:  []
                }
      else
        puts "** !!! ERROR - cannot match league and season in heading; season missing?"
        pp heading
        exit 1
      end
    when :p     ## paragraph with (text) lines
      lines = node[1]
      ## note: skip lines if no heading seen
      if secs.empty?
        puts "** !!! WARN - skipping lines (no heading):"
        pp lines
      else
        ## todo/check: unroll paragraphs into lines or pass along paragraphs - why? why not?
        secs.last[:lines].concat( lines )
      end
    else
      puts "** !!! ERROR - unknown line type; for now only heading 1 for leagues supported; sorry:"
      pp node
      exit 1
    end
  end

  ## pass 2 - filter seasons if filter present
  if season
    filter = norm_seasons( season )
    secs = secs.select do |sec|
      keep = filter.include?( Import::Season.new( sec[:season] ).key )
      puts " skipping season >#{sec[:season]}< NOT matched by filter"  unless keep
      keep
    end
  end

  ## pass 3 - check & map; replace inline (string with data struct record)
  secs.each do |sec|
    sec[:season] = Import::Season.new( sec[:season] )
    sec[:league] = catalog.leagues.find!( sec[:league] )

    check_stage( sec[:stage] )  if sec[:stage]   ## note: only check for now (no remapping etc.)
  end

  secs
end # method parse
|
87
|
+
|
88
|
+
|
89
|
+
|
90
|
+
## shortcut convenience helper - delegates to Import.catalog
def catalog
  Import.catalog
end
|
91
|
+
|
92
|
+
## split into league + season
|
93
|
+
## e.g. Österr. Bundesliga 2015/16 ## or 2015-16
|
94
|
+
## World Cup 2018
|
95
|
+
LEAGUE_SEASON_HEADING_RE = %r{^
|
96
|
+
(?<league>.+?) ## non-greedy
|
97
|
+
\s+
|
98
|
+
(?<season>\d{4}
|
99
|
+
(?:[\/-]\d{1,4})? ## optional 2nd year in season
|
100
|
+
)
|
101
|
+
$}x
|
102
|
+
|
103
|
+
## Turn a season string or an array of season strings into an array of
## season keys (Import::Season#key).
def norm_seasons( season_or_seasons )
  seasons = season_or_seasons
  seasons = [seasons]  if seasons.is_a?( String )   ## wrap in array; otherwise assume it's an array already

  seasons.map { |season| Import::Season.new( season ).key }
end
|
112
|
+
|
113
|
+
|
114
|
+
## Split a heading into league / stage / ... e.g.
##   Österr. Bundesliga 2018/19, Regular Season
##   Österr. Bundesliga 2018/19, Championship Round
##
## Returns the parts with leading and trailing whitespace removed.
##   todo/check: rename to parse_league(s) - why? why not?
def split_league( str )
  ## note: allow , > < or › ‹ for now
  str.split( /[,<>‹›]/ ).map( &:strip )
end
|
123
|
+
|
124
|
+
## Check the stage name (ignoring case) against the builtin list of stages.
##
## Returns nil if the stage is known; exits (status 1) otherwise.
def check_stage( name )
  known_stages = ['regular season', 'championship round', 'relegation round', 'play-offs']
  return if known_stages.include?( name.downcase )   ## everything ok

  puts "** !!! ERROR - no (league) stage match found for >#{name}<, add to (builtin) stages table; sorry"
  exit 1
end
|
138
|
+
|
139
|
+
end # class LeagueOutlineReader
|
140
|
+
|
141
|
+
end # module SportDb
|
@@ -0,0 +1,162 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
module SportDb
|
5
|
+
module Import
|
6
|
+
|
7
|
+
|
8
|
+
class LeagueReader
|
9
|
+
|
10
|
+
## convenience shortcut - delegates to Import.catalog
def catalog
  Import.catalog
end
|
11
|
+
|
12
|
+
|
13
|
+
## Read the (utf-8) text file at path and parse it.
##   todo/check: rename to read_file or from_file etc. - why? why not?
def self.read( path )
  parse( File.read( path, encoding: 'utf-8' ) )
end
|
17
|
+
|
18
|
+
## Parse the text; shortcut for new( txt ).parse.
def self.parse( txt )
  reader = new( txt )
  reader.parse
end
|
21
|
+
|
22
|
+
|
23
|
+
|
24
|
+
include Logging
|
25
|
+
|
26
|
+
## txt - text with the league listing; handed to OutlineReader.parse by parse
def initialize( txt )
  @txt = txt
end
|
29
|
+
|
30
|
+
## Parse the text into league records.
##
## Headings (level 1 only) switch the section:
##   - heading with "national team"             => int'l, national teams, no country
##   - heading with "international" or "int'l"  => int'l, clubs, no country
##   - anything else                            => country (looked up via catalog.countries.parse)
##
## Paragraph lines are either
##   - a league line          =>  key followed by spaces and the league name
##   - a continuation line    =>  starts with | and lists alternative names
##                                for the league of the preceding league line
##
## Returns an array of League records.
## Exits (status 1) on unsupported heading levels, unknown countries,
## unknown node types, league lines without key and continuation lines
## without a preceding league line.
def parse
  recs     = []
  last_rec = nil

  country  = nil     # last country
  intl     = false   # is international (league/tournament/cup/competition)
  clubs    = true    # or clubs|national teams

  OutlineReader.parse( @txt ).each do |node|
    if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
      heading_level = node[0][1].to_i
      heading       = node[1]

      logger.debug "heading #{heading_level} >#{heading}<"

      if heading_level != 1
        puts "** !!! ERROR !!! unsupported heading level; expected heading 1 for now only; sorry"
        pp node
        exit 1
      end

      ## map to country or international / int'l or national teams
      if heading =~ /national team/i   ## national team tournament
        country = nil
        intl    = true
        clubs   = false
      elsif heading =~ /international|int'l/i   ## int'l club tournament
        country = nil
        intl    = true
        clubs   = true
      else
        ## assume country in heading; allow all "formats" supported by parse e.g.
        ##   Österreich • Austria (at)
        ##   Österreich • Austria
        ##   Austria
        ##   Deutschland (de) • Germany
        country = catalog.countries.parse( heading )
        intl    = false
        clubs   = true

        ## check country code - MUST exist for now!!!!
        if country.nil?
          puts "!!! error [league reader] - unknown country >#{heading}< - sorry - add country to config to fix"
          exit 1
        end
      end
    elsif node[0] == :p   ## paragraph with (text) lines
      node[1].each do |line|
        if line.start_with?( '|' )
          ## assume continuation with line of alternative names
          ##   note: skip leading pipe
          values = line[1..-1].split( '|' ).map { |value| squish_league_name( value ) }
          logger.debug "alt_names: #{values.join( '|' )}"

          if last_rec.nil?
            puts "** !!! ERROR !!! [league reader] - alternative names without preceding league line:"
            pp line
            exit 1
          end
          last_rec.alt_names += values
        elsif (m = line.match( /^([a-z0-9][a-z0-9.]*)[ ]+(.+)$/ ))
          ## assume "regular" line starting with key
          ##   (todo/check: use a more "strict"/better regex capture pattern!!!)
          league_key  = m[1]
          league_name = squish_league_name( m[2] )

          logger.debug "key: >#{league_key}<, name: >#{league_name}<"

          alt_names_auto = league_alt_names_auto( country, league_key )
          logger.debug "alt_names_auto: #{alt_names_auto.join( '|' )}"

          ## prepend country key/code if country present
          ##   todo/fix: only auto-prepend country if key/code start with a number (level) or incl. cup
          ##     why? lets you "overwrite" key if desired - use it - why? why not?
          league_key = "#{country.key}.#{league_key}"  if country

          rec = League.new( key:            league_key,
                            name:           league_name,
                            alt_names_auto: alt_names_auto,
                            country:        country,
                            intl:           intl,
                            clubs:          clubs )
          recs << rec
          last_rec = rec
        else
          puts "** !!! ERROR !!! missing key for (canonical) league name"
          exit 1
        end
      end # each line
    else
      puts "** !!! ERROR !!! [league reader] - unknown line type:"
      pp node
      exit 1
    end
  end

  recs
end # method parse

## 1) strip (commercial) sponsor markers/tags e.g. $$ Liga $$BBV$$ MX
## 2) strip and squish (white)spaces
def squish_league_name( name )
  name.delete( '$' ).gsub( /[ \t]+/, ' ' ).strip
end

## Build the auto-generated alternative names for a league key (as written
## in the league line, that is, without the country prefix).
##
## With a country      =>  country key and fifa code followed by the key
##                          plus shortcuts for the key 1 and all-numeric keys
## Without a country   =>  only the key itself (e.g. CL, EL, etc.)
def league_alt_names_auto( country, league_key )
  code = league_key.upcase.gsub( '.', ' ' )
  return [code]  unless country    ## assume int'l - note: no country code (prefix/leading) used

  names = []
  names << "#{country.key.upcase} #{code}"
  ## todo/check: add "hack" for cl (chile) and exclude?
  ##   add a list of (auto-)excluded country codes with conflicts? why? why not?
  ##     cl - a) Chile  b) Champions League
  names << "#{country.key.upcase}"  if league_key == '1'   ## add shortcut for top level 1 (just country key)
  if country.key.upcase != country.fifa
    names << "#{country.fifa} #{code}"
    names << "#{country.fifa}"  if league_key == '1'       ## add shortcut for top level 1 (just fifa code)
  end
  names << "#{country.name} #{league_key}"  if league_key =~ /^[0-9]+$/   ## if all numeric e.g. add Austria 1 etc.
  names
end
private :squish_league_name, :league_alt_names_auto
|
158
|
+
|
159
|
+
end # class LeagueReader
|
160
|
+
|
161
|
+
end ## module Import
|
162
|
+
end ## module SportDb
|