sportdb-config 0.4.1 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -55,23 +55,54 @@ class Club
55
55
  end
56
56
  end
57
57
 
58
+ ###################################
59
+ # "global" helper - move to ___ ? why? why not?
60
+
61
+ YEAR_REGEX = /\([0-9,\- ]+?\)/
62
+ def self.strip_year( name )
63
+ ## check for year(s) e.g. (1887-1911), (-2013),
64
+ ## (1946-2001, 2013-) etc.
65
+ name.gsub( YEAR_REGEX, '' ).strip
66
+ end
67
+
68
+ def self.has_year?( name ) name =~ YEAR_REGEX; end
69
+
70
+ LANG_REGEX = /\[[a-z]{2}\]/
71
+ def self.strip_lang( name )
72
+ name.gsub( LANG_REGEX, '' ).strip
73
+ end
74
+
75
+ def self.has_lang?( name ) name =~ LANG_REGEX; end
76
+
77
+ NORM_REGEX = /[.'º\-\/]/
78
+ ## note: remove all dots (.), dash (-), ', º, /, etc.
79
+ ## for norm(alizing) names
80
+ def self.strip_norm( name )
81
+ name.gsub( NORM_REGEX, '' )
82
+ end
83
+
84
+ def strip_year( name ) self.class.strip_year( name ); end
85
+ def strip_lang( name ) self.class.strip_lang( name ); end
86
+ def strip_norm( name ) self.class.strip_norm( name ); end
87
+
58
88
  private
59
89
  def sanitize( name )
60
- ## check for year(s) e.g. (1887-1911), (-2013) etc.
61
- name = name.gsub( /\([0-9\- ]+?\)/, '' ).strip
90
+ ## check for year(s) e.g. (1887-1911), (-2013),
91
+ ## (1946-2001,2013-) etc.
92
+ name = strip_year( name )
62
93
  ## check lang codes e.g. [en], [fr], etc.
63
- name = name.gsub( /\[[a-z]{2}\]/, '' ).strip
94
+ name = strip_lang( name )
64
95
  name
65
96
  end
66
97
 
67
98
  def normalize( name )
68
- name = sanitize( name)
99
+ name = sanitize( name )
69
100
 
70
- name = name.gsub( '.', '' ) # remove all dots
71
- ## don't report duplicates that only differ with dash (-)
72
- ## e.g. Al Ahly or Al-Ahly for now - why? why not?
73
- name = name.gsub( '-', ' ' ) # replace - with space (e.g. - same as space)
101
+ ## remove all dots (.), dash (-), º, /, etc.
102
+ name = strip_norm( name )
103
+ name = name.gsub( ' ', '' ) # note: also remove all spaces!!!
74
104
 
105
+ ## todo/fix: use our own downcase - why? why not?
75
106
  name = name.downcase ## do NOT care about upper and lowercase for now
76
107
  name
77
108
  end
@@ -97,6 +128,13 @@ class ClubIndex
97
128
  def clubs() @clubs.values; end
98
129
 
99
130
 
131
+ ## helpers from club - use a helper module for includes - why? why not?
132
+ def strip_year( name ) Club.strip_year( name ); end
133
+ def has_year?( name) Club.has_year?( name ); end
134
+ def strip_lang( name ) Club.strip_lang( name ); end
135
+ def strip_norm( name ) Club.strip_norm( name ); end
136
+
137
+
100
138
 
101
139
  def add( rec_or_recs ) ## add club record / alt_names
102
140
  recs = rec_or_recs.is_a?( Array ) ? rec_or_recs : [rec_or_recs] ## wrap (single) rec in array
@@ -119,10 +157,11 @@ class ClubIndex
119
157
  names = [rec.name] + rec.alt_names
120
158
  more_names = []
121
159
  ## check "hand-typed" names for year (auto-add)
122
- ## check for year(s) e.g. (1887-1911), (-2013) etc.
160
+ ## check for year(s) e.g. (1887-1911), (-2013),
161
+ ## (1946-2001,2013-) etc.
123
162
  names.each do |name|
124
- if name =~ /\([0-9\- ]+?\)/
125
- more_names << name.gsub( /\([0-9\- ]+?\)/, '' ).strip
163
+ if has_year?( name )
164
+ more_names << strip_year( name )
126
165
  end
127
166
  end
128
167
 
@@ -151,7 +190,7 @@ class ClubIndex
151
190
 
152
191
  names.each_with_index do |name,i|
153
192
  ## check lang codes e.g. [en], [fr], etc.
154
- name = name.gsub( /\[[a-z]{2}\]/, '' ).strip
193
+ name = strip_lang( name )
155
194
  norm = normalize( name )
156
195
  alt_recs = @clubs_by_name[ norm ]
157
196
  if alt_recs
@@ -206,17 +245,6 @@ class ClubIndex
206
245
 
207
246
 
208
247
  def dump_duplicates # debug helper - report duplicate club name records
209
-
210
- ## todo/fix: remove club.duplicates - alreay included in reports -see TeamDuplicatePart
211
- ## more a "feature" of Clubs than ClubIndex class - why? why not?
212
- @clubs.values.each do |club|
213
- if club.duplicates?
214
- duplicates = club.duplicates
215
- puts "#{duplicates.size} (norm) name duplicate(s) for #{club.name}, #{club.country.name}:"
216
- pp duplicates
217
- end
218
- end
219
-
220
248
  @clubs_by_name.each do |name, clubs|
221
249
  if clubs.size > 1
222
250
  puts "#{clubs.size} matching club duplicates for >#{name}<:"
@@ -229,11 +257,10 @@ class ClubIndex
229
257
 
230
258
  private
231
259
  def normalize( name )
232
- name = name.gsub( '.', '' ) # remove all dots
233
- ## don't report duplicates that only differ with dash (-)
234
- ## e.g. Al Ahly or Al-Ahly for now - why? why not?
235
- name = name.gsub( '-', ' ' ) # replace - with space (e.g. - same as space)
260
+ name = strip_norm( name )
261
+ name = name.gsub( ' ', '' ) # remove all spaces
236
262
 
263
+ ## todo/fix: use our own downcase - why? why not?
237
264
  name = name.downcase ## do NOT care about upper and lowercase for now
238
265
  name
239
266
  end
@@ -1,123 +1,123 @@
1
- # encoding: utf-8
2
-
3
- module SportDb
4
- module Import
5
-
6
-
7
- class Configuration
8
-
9
- ##
10
- ## todo: allow configure of countries_dir like clubs_dir
11
- ## "fallback" and use a default built-in world/countries.txt
12
-
13
- ## todo/check: rename to country_mappings/index - why? why not?
14
- ## or countries_by_code or countries_by_key
15
- def countries
16
- @countries ||= build_country_index
17
- @countries
18
- end
19
-
20
- def build_country_index ## todo/check: rename to setup_country_index or read_country_index - why? why not?
21
- recs = read_csv( "#{SportDb::Boot.data_dir}/world/countries.txt" )
22
- CountryIndex.new( recs )
23
- end
24
-
25
-
26
-
27
- def clubs
28
- @clubs ||= build_club_index
29
- @clubs
30
- end
31
-
32
- ####
33
- # todo/fix: find a better way to configure club / team datasets
34
- attr_accessor :clubs_dir
35
- def clubs_dir() @clubs_dir ||= './clubs'; end
36
-
37
-
38
- CLUBS_REGEX = %r{ (?:^|/) # beginning (^) or beginning of path (/)
39
- (?:[a-z]{1,3}\.)? # optional country code/key e.g. eng.clubs.txt
40
- clubs\.txt$
41
- }x
42
-
43
- def find_clubs_datafiles( path )
44
- datafiles = [] ## note: [country, path] pairs for now
45
-
46
- ## check all txt files as candidates (MUST include country code for now)
47
- candidates = Dir.glob( "#{path}/**/*.txt" )
48
- pp candidates
49
- candidates.each do |candidate|
50
- datafiles << candidate if CLUBS_REGEX.match( candidate )
51
- end
52
-
53
- pp datafiles
54
- datafiles
55
- end
56
-
57
-
58
- def build_club_index
59
- ## unify team names; team (builtin/known/shared) name mappings
60
- ## cleanup team names - use local ("native") name with umlaut etc.
61
- recs = []
62
-
63
- ## todo/fix: pass along / use country code too
64
- ## note: country code no longer needed in path (is now expected as heading inside the file)
65
-
66
- ## todo/fix: add to teamreader
67
- ## check that name and alt_names for a club are all unique (not duplicates)
68
- datafiles = find_clubs_datafiles( clubs_dir )
69
- datafiles.each do |datafile|
70
- recs += ClubReader.read( datafile )
71
- end
72
-
73
-
74
- clubs = ClubIndex.new
75
- clubs.add( recs )
76
-
77
- if clubs.errors?
78
- puts ""
79
- puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
80
- puts " #{clubs.errors.size} errors:"
81
- pp clubs.errors
82
- ## exit 1
83
- end
84
-
85
- clubs
86
- end # method build_club_index
87
-
88
-
89
-
90
-
91
- def leagues
92
- read_leagues() if @leagues.nil?
93
- @leagues
94
- end
95
-
96
- def read_leagues
97
- #####
98
- # add / read-in leagues config
99
- @leagues = LeagueConfig.new
100
-
101
- self ## return self for chaining
102
- end
103
- end # class Configuration
104
-
105
-
106
-
107
-
108
-
109
- ## lets you use
110
- ## SportDb::Import.configure do |config|
111
- ## config.hello = 'World'
112
- ## end
113
-
114
- def self.configure
115
- yield( config )
116
- end
117
-
118
- def self.config
119
- @config ||= Configuration.new
120
- end
121
-
122
- end # module Import
123
- end # module SportDb
1
+ # encoding: utf-8
2
+
3
+ module SportDb
4
+ module Import
5
+
6
+
7
+ class Configuration
8
+
9
+ ##
10
+ ## todo: allow configure of countries_dir like clubs_dir
11
+ ## "fallback" and use a default built-in world/countries.txt
12
+
13
+ ## todo/check: rename to country_mappings/index - why? why not?
14
+ ## or countries_by_code or countries_by_key
15
+ def countries
16
+ @countries ||= build_country_index
17
+ @countries
18
+ end
19
+
20
+ def build_country_index ## todo/check: rename to setup_country_index or read_country_index - why? why not?
21
+ recs = read_csv( "#{SportDb::Boot.data_dir}/world/countries.txt" )
22
+ CountryIndex.new( recs )
23
+ end
24
+
25
+
26
+
27
+ def clubs
28
+ @clubs ||= build_club_index
29
+ @clubs
30
+ end
31
+
32
+ ####
33
+ # todo/fix: find a better way to configure club / team datasets
34
+ attr_accessor :clubs_dir
35
+ def clubs_dir() @clubs_dir ||= './clubs'; end
36
+
37
+
38
+ CLUBS_REGEX = %r{ (?:^|/) # beginning (^) or beginning of path (/)
39
+ (?:[a-z]{1,3}\.)? # optional country code/key e.g. eng.clubs.txt
40
+ clubs\.txt$
41
+ }x
42
+
43
+ def find_clubs_datafiles( path )
44
+ datafiles = [] ## note: [country, path] pairs for now
45
+
46
+ ## check all txt files as candidates (MUST include country code for now)
47
+ candidates = Dir.glob( "#{path}/**/*.txt" )
48
+ pp candidates
49
+ candidates.each do |candidate|
50
+ datafiles << candidate if CLUBS_REGEX.match( candidate )
51
+ end
52
+
53
+ pp datafiles
54
+ datafiles
55
+ end
56
+
57
+
58
+ def build_club_index
59
+ ## unify team names; team (builtin/known/shared) name mappings
60
+ ## cleanup team names - use local ("native") name with umlaut etc.
61
+ recs = []
62
+
63
+ ## todo/fix: pass along / use country code too
64
+ ## note: country code no longer needed in path (is now expected as heading inside the file)
65
+
66
+ ## todo/fix: add to teamreader
67
+ ## check that name and alt_names for a club are all unique (not duplicates)
68
+ datafiles = find_clubs_datafiles( clubs_dir )
69
+ datafiles.each do |datafile|
70
+ recs += ClubReader.read( datafile )
71
+ end
72
+
73
+
74
+ clubs = ClubIndex.new
75
+ clubs.add( recs )
76
+
77
+ if clubs.errors?
78
+ puts ""
79
+ puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
80
+ puts " #{clubs.errors.size} errors:"
81
+ pp clubs.errors
82
+ ## exit 1
83
+ end
84
+
85
+ clubs
86
+ end # method build_club_index
87
+
88
+
89
+
90
+
91
+ def leagues
92
+ read_leagues() if @leagues.nil?
93
+ @leagues
94
+ end
95
+
96
+ def read_leagues
97
+ #####
98
+ # add / read-in leagues config
99
+ @leagues = LeagueConfig.new
100
+
101
+ self ## return self for chaining
102
+ end
103
+ end # class Configuration
104
+
105
+
106
+
107
+
108
+
109
+ ## lets you use
110
+ ## SportDb::Import.configure do |config|
111
+ ## config.hello = 'World'
112
+ ## end
113
+
114
+ def self.configure
115
+ yield( config )
116
+ end
117
+
118
+ def self.config
119
+ @config ||= Configuration.new
120
+ end
121
+
122
+ end # module Import
123
+ end # module SportDb
@@ -1,118 +1,118 @@
1
- # encoding: utf-8
2
-
3
- module SportDb
4
- module Import
5
-
6
-
7
- class LeagueConfig ## use LeagueInfo or LeagueMap or LeagueHash or similar
8
-
9
- def initialize
10
-
11
- ## just use leagues without latest for latest - why? why not?
12
- @leagues_latest = {
13
- 'es' => { '1' => 'liga', # spanish liga 1
14
- '2' => 'liga2', # spanish liga 2
15
- },
16
- 'it' => { '1' => 'seriea', # italian serie a
17
- '2' => 'serieb', # italian serie b
18
- },
19
- 'de' => { '1' => 'bundesliga', # german bundesliga
20
- '2' => 'bundesliga2', # german 2. bundesliga
21
- },
22
- 'nl' => { '1' => 'eredivisie' }, # dutch eredivisie
23
- 'be' => { '1' => 'proleague' }, # belgian pro league
24
- 'pt' => { '1' => 'liga' }, # portugese Primeira Liga
25
- 'tr' => { '1' => 'superlig' }, # turkish Süper Lig
26
-
27
- # note: eng now read from txt
28
- # 'eng' => { '1' => 'premierleague', # english premier league
29
- # '2' => 'championship', # english championship league
30
- # '3' => 'league1', # english league 1
31
- # },
32
- }
33
-
34
- ## change history to past or changes/changelog something - why? why not?
35
- @leagues_history = {
36
-
37
- # note: eng now read from txt
38
- # 'eng' => {
39
- # ## until (including) 2003-04 season
40
- # '2003-04' => { '1' => 'premierleague', # english premier league
41
- # '2' => 'division1', # english division 1
42
- # },
43
- # ## until (including) 1991-92} season
44
- # '1991-92' => { '1' => 'division1', # english division 1
45
- # '2' => 'division2', # english division 2
46
- # }
47
- # }
48
- }
49
-
50
- pp @leagues_latest
51
- pp @leagues_history
52
-
53
- %w(eng sco fr gr).each do |country|
54
- hash = LeagueReader.read( "#{Boot.data_dir}/leagues/#{country}.txt" )
55
- pp hash
56
-
57
- hash.each do |season,league_hash|
58
- if season == '*' ## assume latest / default season
59
- @leagues_latest[ country ] = league_hash
60
- else
61
- @leagues_history[ country ] ||= {}
62
- @leagues_history[ country ][ season ] = league_hash
63
- end
64
- end
65
- end
66
-
67
- pp @leagues_latest
68
- pp @leagues_history
69
- end
70
-
71
-
72
-
73
- def basename( league, country:, season: )
74
- ## todo/check: rename league: to key: - why? why not?
75
-
76
- if country.include?( '-' ) ## assume package name e.g. eng-england etc.
77
- ## cut off country code from package name
78
- cc = country.split( '-' )[0] # use first part
79
- else
80
- cc = country
81
- end
82
-
83
- if season
84
- puts " checking season >#{season}<"
85
- ## check history if season is provided / supplied / known
86
- history = @leagues_history[ cc ]
87
- if history
88
- season_start_year = SeasonUtils.start_year( season ).to_i
89
- ##
90
- ## todo: sorty season keys - why? why not? -- assume reverse chronological order for now
91
- history.keys.reverse.each do |key|
92
- history_season_start_year = SeasonUtils.start_year( key ).to_i
93
- puts " #{season_start_year} <= #{history_season_start_year} - #{season_start_year <= history_season_start_year}"
94
- if season_start_year <= history_season_start_year
95
- result = history[ key ][ league ]
96
- if result
97
- return "#{league}-#{result}"
98
- else
99
- return nil
100
- end
101
- end
102
- end
103
- end
104
- end
105
-
106
- latest = @leagues_latest[ cc ]
107
- if latest
108
- result = latest[ league ]
109
- return "#{league}-#{result}" if result
110
- end
111
-
112
- nil
113
- end # method basename
114
- end # class LeagueConfig
115
-
116
-
117
- end ## module Import
118
- end ## module SportDb
1
+ # encoding: utf-8
2
+
3
+ module SportDb
4
+ module Import
5
+
6
+
7
+ class LeagueConfig ## use LeagueInfo or LeagueMap or LeagueHash or similar
8
+
9
+ def initialize
10
+
11
+ ## just use leagues without latest for latest - why? why not?
12
+ @leagues_latest = {
13
+ 'es' => { '1' => 'liga', # spanish liga 1
14
+ '2' => 'liga2', # spanish liga 2
15
+ },
16
+ 'it' => { '1' => 'seriea', # italian serie a
17
+ '2' => 'serieb', # italian serie b
18
+ },
19
+ 'de' => { '1' => 'bundesliga', # german bundesliga
20
+ '2' => 'bundesliga2', # german 2. bundesliga
21
+ },
22
+ 'nl' => { '1' => 'eredivisie' }, # dutch eredivisie
23
+ 'be' => { '1' => 'proleague' }, # belgian pro league
24
+ 'pt' => { '1' => 'liga' }, # portugese Primeira Liga
25
+ 'tr' => { '1' => 'superlig' }, # turkish Süper Lig
26
+
27
+ # note: eng now read from txt
28
+ # 'eng' => { '1' => 'premierleague', # english premier league
29
+ # '2' => 'championship', # english championship league
30
+ # '3' => 'league1', # english league 1
31
+ # },
32
+ }
33
+
34
+ ## change history to past or changes/changelog something - why? why not?
35
+ @leagues_history = {
36
+
37
+ # note: eng now read from txt
38
+ # 'eng' => {
39
+ # ## until (including) 2003-04 season
40
+ # '2003-04' => { '1' => 'premierleague', # english premier league
41
+ # '2' => 'division1', # english division 1
42
+ # },
43
+ # ## until (including) 1991-92} season
44
+ # '1991-92' => { '1' => 'division1', # english division 1
45
+ # '2' => 'division2', # english division 2
46
+ # }
47
+ # }
48
+ }
49
+
50
+ pp @leagues_latest
51
+ pp @leagues_history
52
+
53
+ %w(eng sco fr gr).each do |country|
54
+ hash = LeagueReader.read( "#{Boot.data_dir}/leagues/#{country}.txt" )
55
+ pp hash
56
+
57
+ hash.each do |season,league_hash|
58
+ if season == '*' ## assume latest / default season
59
+ @leagues_latest[ country ] = league_hash
60
+ else
61
+ @leagues_history[ country ] ||= {}
62
+ @leagues_history[ country ][ season ] = league_hash
63
+ end
64
+ end
65
+ end
66
+
67
+ pp @leagues_latest
68
+ pp @leagues_history
69
+ end
70
+
71
+
72
+
73
+ def basename( league, country:, season: )
74
+ ## todo/check: rename league: to key: - why? why not?
75
+
76
+ if country.include?( '-' ) ## assume package name e.g. eng-england etc.
77
+ ## cut off country code from package name
78
+ cc = country.split( '-' )[0] # use first part
79
+ else
80
+ cc = country
81
+ end
82
+
83
+ if season
84
+ puts " checking season >#{season}<"
85
+ ## check history if season is provided / supplied / known
86
+ history = @leagues_history[ cc ]
87
+ if history
88
+ season_start_year = SeasonUtils.start_year( season ).to_i
89
+ ##
90
+ ## todo: sorty season keys - why? why not? -- assume reverse chronological order for now
91
+ history.keys.reverse.each do |key|
92
+ history_season_start_year = SeasonUtils.start_year( key ).to_i
93
+ puts " #{season_start_year} <= #{history_season_start_year} - #{season_start_year <= history_season_start_year}"
94
+ if season_start_year <= history_season_start_year
95
+ result = history[ key ][ league ]
96
+ if result
97
+ return "#{league}-#{result}"
98
+ else
99
+ return nil
100
+ end
101
+ end
102
+ end
103
+ end
104
+ end
105
+
106
+ latest = @leagues_latest[ cc ]
107
+ if latest
108
+ result = latest[ league ]
109
+ return "#{league}-#{result}" if result
110
+ end
111
+
112
+ nil
113
+ end # method basename
114
+ end # class LeagueConfig
115
+
116
+
117
+ end ## module Import
118
+ end ## module SportDb