sportdb-config 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -55,23 +55,54 @@ class Club
55
55
  end
56
56
  end
57
57
 
58
+ ###################################
59
+ # "global" helper - move to ___ ? why? why not?
60
+
61
+ YEAR_REGEX = /\([0-9,\- ]+?\)/
62
+ def self.strip_year( name )
63
+ ## check for year(s) e.g. (1887-1911), (-2013),
64
+ ## (1946-2001, 2013-) etc.
65
+ name.gsub( YEAR_REGEX, '' ).strip
66
+ end
67
+
68
+ def self.has_year?( name ) name =~ YEAR_REGEX; end
69
+
70
+ LANG_REGEX = /\[[a-z]{2}\]/
71
+ def self.strip_lang( name )
72
+ name.gsub( LANG_REGEX, '' ).strip
73
+ end
74
+
75
+ def self.has_lang?( name ) name =~ LANG_REGEX; end
76
+
77
+ NORM_REGEX = /[.'º\-\/]/
78
+ ## note: remove all dots (.), dash (-), ', º, /, etc.
79
+ ## for norm(alizing) names
80
+ def self.strip_norm( name )
81
+ name.gsub( NORM_REGEX, '' )
82
+ end
83
+
84
+ def strip_year( name ) self.class.strip_year( name ); end
85
+ def strip_lang( name ) self.class.strip_lang( name ); end
86
+ def strip_norm( name ) self.class.strip_norm( name ); end
87
+
58
88
  private
59
89
  def sanitize( name )
60
- ## check for year(s) e.g. (1887-1911), (-2013) etc.
61
- name = name.gsub( /\([0-9\- ]+?\)/, '' ).strip
90
+ ## check for year(s) e.g. (1887-1911), (-2013),
91
+ ## (1946-2001,2013-) etc.
92
+ name = strip_year( name )
62
93
  ## check lang codes e.g. [en], [fr], etc.
63
- name = name.gsub( /\[[a-z]{2}\]/, '' ).strip
94
+ name = strip_lang( name )
64
95
  name
65
96
  end
66
97
 
67
98
  def normalize( name )
68
- name = sanitize( name)
99
+ name = sanitize( name )
69
100
 
70
- name = name.gsub( '.', '' ) # remove all dots
71
- ## don't report duplicates that only differ with dash (-)
72
- ## e.g. Al Ahly or Al-Ahly for now - why? why not?
73
- name = name.gsub( '-', ' ' ) # replace - with space (e.g. - same as space)
101
+ ## remove all dots (.), dash (-), º, /, etc.
102
+ name = strip_norm( name )
103
+ name = name.gsub( ' ', '' ) # note: also remove all spaces!!!
74
104
 
105
+ ## todo/fix: use our own downcase - why? why not?
75
106
  name = name.downcase ## do NOT care about upper and lowercase for now
76
107
  name
77
108
  end
@@ -97,6 +128,13 @@ class ClubIndex
97
128
  def clubs() @clubs.values; end
98
129
 
99
130
 
131
+ ## helpers from club - use a helper module for includes - why? why not?
132
+ def strip_year( name ) Club.strip_year( name ); end
133
+ def has_year?( name) Club.has_year?( name ); end
134
+ def strip_lang( name ) Club.strip_lang( name ); end
135
+ def strip_norm( name ) Club.strip_norm( name ); end
136
+
137
+
100
138
 
101
139
  def add( rec_or_recs ) ## add club record / alt_names
102
140
  recs = rec_or_recs.is_a?( Array ) ? rec_or_recs : [rec_or_recs] ## wrap (single) rec in array
@@ -119,10 +157,11 @@ class ClubIndex
119
157
  names = [rec.name] + rec.alt_names
120
158
  more_names = []
121
159
  ## check "hand-typed" names for year (auto-add)
122
- ## check for year(s) e.g. (1887-1911), (-2013) etc.
160
+ ## check for year(s) e.g. (1887-1911), (-2013),
161
+ ## (1946-2001,2013-) etc.
123
162
  names.each do |name|
124
- if name =~ /\([0-9\- ]+?\)/
125
- more_names << name.gsub( /\([0-9\- ]+?\)/, '' ).strip
163
+ if has_year?( name )
164
+ more_names << strip_year( name )
126
165
  end
127
166
  end
128
167
 
@@ -151,7 +190,7 @@ class ClubIndex
151
190
 
152
191
  names.each_with_index do |name,i|
153
192
  ## check lang codes e.g. [en], [fr], etc.
154
- name = name.gsub( /\[[a-z]{2}\]/, '' ).strip
193
+ name = strip_lang( name )
155
194
  norm = normalize( name )
156
195
  alt_recs = @clubs_by_name[ norm ]
157
196
  if alt_recs
@@ -206,17 +245,6 @@ class ClubIndex
206
245
 
207
246
 
208
247
  def dump_duplicates # debug helper - report duplicate club name records
209
-
210
- ## todo/fix: remove club.duplicates - alreay included in reports -see TeamDuplicatePart
211
- ## more a "feature" of Clubs than ClubIndex class - why? why not?
212
- @clubs.values.each do |club|
213
- if club.duplicates?
214
- duplicates = club.duplicates
215
- puts "#{duplicates.size} (norm) name duplicate(s) for #{club.name}, #{club.country.name}:"
216
- pp duplicates
217
- end
218
- end
219
-
220
248
  @clubs_by_name.each do |name, clubs|
221
249
  if clubs.size > 1
222
250
  puts "#{clubs.size} matching club duplicates for >#{name}<:"
@@ -229,11 +257,10 @@ class ClubIndex
229
257
 
230
258
  private
231
259
  def normalize( name )
232
- name = name.gsub( '.', '' ) # remove all dots
233
- ## don't report duplicates that only differ with dash (-)
234
- ## e.g. Al Ahly or Al-Ahly for now - why? why not?
235
- name = name.gsub( '-', ' ' ) # replace - with space (e.g. - same as space)
260
+ name = strip_norm( name )
261
+ name = name.gsub( ' ', '' ) # remove all spaces
236
262
 
263
+ ## todo/fix: use our own downcase - why? why not?
237
264
  name = name.downcase ## do NOT care about upper and lowercase for now
238
265
  name
239
266
  end
@@ -1,123 +1,123 @@
1
- # encoding: utf-8
2
-
3
- module SportDb
4
- module Import
5
-
6
-
7
- class Configuration
8
-
9
- ##
10
- ## todo: allow configure of countries_dir like clubs_dir
11
- ## "fallback" and use a default built-in world/countries.txt
12
-
13
- ## todo/check: rename to country_mappings/index - why? why not?
14
- ## or countries_by_code or countries_by_key
15
- def countries
16
- @countries ||= build_country_index
17
- @countries
18
- end
19
-
20
- def build_country_index ## todo/check: rename to setup_country_index or read_country_index - why? why not?
21
- recs = read_csv( "#{SportDb::Boot.data_dir}/world/countries.txt" )
22
- CountryIndex.new( recs )
23
- end
24
-
25
-
26
-
27
- def clubs
28
- @clubs ||= build_club_index
29
- @clubs
30
- end
31
-
32
- ####
33
- # todo/fix: find a better way to configure club / team datasets
34
- attr_accessor :clubs_dir
35
- def clubs_dir() @clubs_dir ||= './clubs'; end
36
-
37
-
38
- CLUBS_REGEX = %r{ (?:^|/) # beginning (^) or beginning of path (/)
39
- (?:[a-z]{1,3}\.)? # optional country code/key e.g. eng.clubs.txt
40
- clubs\.txt$
41
- }x
42
-
43
- def find_clubs_datafiles( path )
44
- datafiles = [] ## note: [country, path] pairs for now
45
-
46
- ## check all txt files as candidates (MUST include country code for now)
47
- candidates = Dir.glob( "#{path}/**/*.txt" )
48
- pp candidates
49
- candidates.each do |candidate|
50
- datafiles << candidate if CLUBS_REGEX.match( candidate )
51
- end
52
-
53
- pp datafiles
54
- datafiles
55
- end
56
-
57
-
58
- def build_club_index
59
- ## unify team names; team (builtin/known/shared) name mappings
60
- ## cleanup team names - use local ("native") name with umlaut etc.
61
- recs = []
62
-
63
- ## todo/fix: pass along / use country code too
64
- ## note: country code no longer needed in path (is now expected as heading inside the file)
65
-
66
- ## todo/fix: add to teamreader
67
- ## check that name and alt_names for a club are all unique (not duplicates)
68
- datafiles = find_clubs_datafiles( clubs_dir )
69
- datafiles.each do |datafile|
70
- recs += ClubReader.read( datafile )
71
- end
72
-
73
-
74
- clubs = ClubIndex.new
75
- clubs.add( recs )
76
-
77
- if clubs.errors?
78
- puts ""
79
- puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
80
- puts " #{clubs.errors.size} errors:"
81
- pp clubs.errors
82
- ## exit 1
83
- end
84
-
85
- clubs
86
- end # method build_club_index
87
-
88
-
89
-
90
-
91
- def leagues
92
- read_leagues() if @leagues.nil?
93
- @leagues
94
- end
95
-
96
- def read_leagues
97
- #####
98
- # add / read-in leagues config
99
- @leagues = LeagueConfig.new
100
-
101
- self ## return self for chaining
102
- end
103
- end # class Configuration
104
-
105
-
106
-
107
-
108
-
109
- ## lets you use
110
- ## SportDb::Import.configure do |config|
111
- ## config.hello = 'World'
112
- ## end
113
-
114
- def self.configure
115
- yield( config )
116
- end
117
-
118
- def self.config
119
- @config ||= Configuration.new
120
- end
121
-
122
- end # module Import
123
- end # module SportDb
1
+ # encoding: utf-8
2
+
3
+ module SportDb
4
+ module Import
5
+
6
+
7
+ class Configuration
8
+
9
+ ##
10
+ ## todo: allow configure of countries_dir like clubs_dir
11
+ ## "fallback" and use a default built-in world/countries.txt
12
+
13
+ ## todo/check: rename to country_mappings/index - why? why not?
14
+ ## or countries_by_code or countries_by_key
15
+ def countries
16
+ @countries ||= build_country_index
17
+ @countries
18
+ end
19
+
20
+ def build_country_index ## todo/check: rename to setup_country_index or read_country_index - why? why not?
21
+ recs = read_csv( "#{SportDb::Boot.data_dir}/world/countries.txt" )
22
+ CountryIndex.new( recs )
23
+ end
24
+
25
+
26
+
27
+ def clubs
28
+ @clubs ||= build_club_index
29
+ @clubs
30
+ end
31
+
32
+ ####
33
+ # todo/fix: find a better way to configure club / team datasets
34
+ attr_accessor :clubs_dir
35
+ def clubs_dir() @clubs_dir ||= './clubs'; end
36
+
37
+
38
+ CLUBS_REGEX = %r{ (?:^|/) # beginning (^) or beginning of path (/)
39
+ (?:[a-z]{1,3}\.)? # optional country code/key e.g. eng.clubs.txt
40
+ clubs\.txt$
41
+ }x
42
+
43
+ def find_clubs_datafiles( path )
44
+ datafiles = [] ## note: [country, path] pairs for now
45
+
46
+ ## check all txt files as candidates (MUST include country code for now)
47
+ candidates = Dir.glob( "#{path}/**/*.txt" )
48
+ pp candidates
49
+ candidates.each do |candidate|
50
+ datafiles << candidate if CLUBS_REGEX.match( candidate )
51
+ end
52
+
53
+ pp datafiles
54
+ datafiles
55
+ end
56
+
57
+
58
+ def build_club_index
59
+ ## unify team names; team (builtin/known/shared) name mappings
60
+ ## cleanup team names - use local ("native") name with umlaut etc.
61
+ recs = []
62
+
63
+ ## todo/fix: pass along / use country code too
64
+ ## note: country code no longer needed in path (is now expected as heading inside the file)
65
+
66
+ ## todo/fix: add to teamreader
67
+ ## check that name and alt_names for a club are all unique (not duplicates)
68
+ datafiles = find_clubs_datafiles( clubs_dir )
69
+ datafiles.each do |datafile|
70
+ recs += ClubReader.read( datafile )
71
+ end
72
+
73
+
74
+ clubs = ClubIndex.new
75
+ clubs.add( recs )
76
+
77
+ if clubs.errors?
78
+ puts ""
79
+ puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
80
+ puts " #{clubs.errors.size} errors:"
81
+ pp clubs.errors
82
+ ## exit 1
83
+ end
84
+
85
+ clubs
86
+ end # method build_club_index
87
+
88
+
89
+
90
+
91
+ def leagues
92
+ read_leagues() if @leagues.nil?
93
+ @leagues
94
+ end
95
+
96
+ def read_leagues
97
+ #####
98
+ # add / read-in leagues config
99
+ @leagues = LeagueConfig.new
100
+
101
+ self ## return self for chaining
102
+ end
103
+ end # class Configuration
104
+
105
+
106
+
107
+
108
+
109
+ ## lets you use
110
+ ## SportDb::Import.configure do |config|
111
+ ## config.hello = 'World'
112
+ ## end
113
+
114
+ def self.configure
115
+ yield( config )
116
+ end
117
+
118
+ def self.config
119
+ @config ||= Configuration.new
120
+ end
121
+
122
+ end # module Import
123
+ end # module SportDb
@@ -1,118 +1,118 @@
1
- # encoding: utf-8
2
-
3
- module SportDb
4
- module Import
5
-
6
-
7
- class LeagueConfig ## use LeagueInfo or LeagueMap or LeagueHash or similar
8
-
9
- def initialize
10
-
11
- ## just use leagues without latest for latest - why? why not?
12
- @leagues_latest = {
13
- 'es' => { '1' => 'liga', # spanish liga 1
14
- '2' => 'liga2', # spanish liga 2
15
- },
16
- 'it' => { '1' => 'seriea', # italian serie a
17
- '2' => 'serieb', # italian serie b
18
- },
19
- 'de' => { '1' => 'bundesliga', # german bundesliga
20
- '2' => 'bundesliga2', # german 2. bundesliga
21
- },
22
- 'nl' => { '1' => 'eredivisie' }, # dutch eredivisie
23
- 'be' => { '1' => 'proleague' }, # belgian pro league
24
- 'pt' => { '1' => 'liga' }, # portugese Primeira Liga
25
- 'tr' => { '1' => 'superlig' }, # turkish Süper Lig
26
-
27
- # note: eng now read from txt
28
- # 'eng' => { '1' => 'premierleague', # english premier league
29
- # '2' => 'championship', # english championship league
30
- # '3' => 'league1', # english league 1
31
- # },
32
- }
33
-
34
- ## change history to past or changes/changelog something - why? why not?
35
- @leagues_history = {
36
-
37
- # note: eng now read from txt
38
- # 'eng' => {
39
- # ## until (including) 2003-04 season
40
- # '2003-04' => { '1' => 'premierleague', # english premier league
41
- # '2' => 'division1', # english division 1
42
- # },
43
- # ## until (including) 1991-92} season
44
- # '1991-92' => { '1' => 'division1', # english division 1
45
- # '2' => 'division2', # english division 2
46
- # }
47
- # }
48
- }
49
-
50
- pp @leagues_latest
51
- pp @leagues_history
52
-
53
- %w(eng sco fr gr).each do |country|
54
- hash = LeagueReader.read( "#{Boot.data_dir}/leagues/#{country}.txt" )
55
- pp hash
56
-
57
- hash.each do |season,league_hash|
58
- if season == '*' ## assume latest / default season
59
- @leagues_latest[ country ] = league_hash
60
- else
61
- @leagues_history[ country ] ||= {}
62
- @leagues_history[ country ][ season ] = league_hash
63
- end
64
- end
65
- end
66
-
67
- pp @leagues_latest
68
- pp @leagues_history
69
- end
70
-
71
-
72
-
73
- def basename( league, country:, season: )
74
- ## todo/check: rename league: to key: - why? why not?
75
-
76
- if country.include?( '-' ) ## assume package name e.g. eng-england etc.
77
- ## cut off country code from package name
78
- cc = country.split( '-' )[0] # use first part
79
- else
80
- cc = country
81
- end
82
-
83
- if season
84
- puts " checking season >#{season}<"
85
- ## check history if season is provided / supplied / known
86
- history = @leagues_history[ cc ]
87
- if history
88
- season_start_year = SeasonUtils.start_year( season ).to_i
89
- ##
90
- ## todo: sorty season keys - why? why not? -- assume reverse chronological order for now
91
- history.keys.reverse.each do |key|
92
- history_season_start_year = SeasonUtils.start_year( key ).to_i
93
- puts " #{season_start_year} <= #{history_season_start_year} - #{season_start_year <= history_season_start_year}"
94
- if season_start_year <= history_season_start_year
95
- result = history[ key ][ league ]
96
- if result
97
- return "#{league}-#{result}"
98
- else
99
- return nil
100
- end
101
- end
102
- end
103
- end
104
- end
105
-
106
- latest = @leagues_latest[ cc ]
107
- if latest
108
- result = latest[ league ]
109
- return "#{league}-#{result}" if result
110
- end
111
-
112
- nil
113
- end # method basename
114
- end # class LeagueConfig
115
-
116
-
117
- end ## module Import
118
- end ## module SportDb
1
+ # encoding: utf-8
2
+
3
+ module SportDb
4
+ module Import
5
+
6
+
7
+ class LeagueConfig ## use LeagueInfo or LeagueMap or LeagueHash or similar
8
+
9
+ def initialize
10
+
11
+ ## just use leagues without latest for latest - why? why not?
12
+ @leagues_latest = {
13
+ 'es' => { '1' => 'liga', # spanish liga 1
14
+ '2' => 'liga2', # spanish liga 2
15
+ },
16
+ 'it' => { '1' => 'seriea', # italian serie a
17
+ '2' => 'serieb', # italian serie b
18
+ },
19
+ 'de' => { '1' => 'bundesliga', # german bundesliga
20
+ '2' => 'bundesliga2', # german 2. bundesliga
21
+ },
22
+ 'nl' => { '1' => 'eredivisie' }, # dutch eredivisie
23
+ 'be' => { '1' => 'proleague' }, # belgian pro league
24
+ 'pt' => { '1' => 'liga' }, # portugese Primeira Liga
25
+ 'tr' => { '1' => 'superlig' }, # turkish Süper Lig
26
+
27
+ # note: eng now read from txt
28
+ # 'eng' => { '1' => 'premierleague', # english premier league
29
+ # '2' => 'championship', # english championship league
30
+ # '3' => 'league1', # english league 1
31
+ # },
32
+ }
33
+
34
+ ## change history to past or changes/changelog something - why? why not?
35
+ @leagues_history = {
36
+
37
+ # note: eng now read from txt
38
+ # 'eng' => {
39
+ # ## until (including) 2003-04 season
40
+ # '2003-04' => { '1' => 'premierleague', # english premier league
41
+ # '2' => 'division1', # english division 1
42
+ # },
43
+ # ## until (including) 1991-92} season
44
+ # '1991-92' => { '1' => 'division1', # english division 1
45
+ # '2' => 'division2', # english division 2
46
+ # }
47
+ # }
48
+ }
49
+
50
+ pp @leagues_latest
51
+ pp @leagues_history
52
+
53
+ %w(eng sco fr gr).each do |country|
54
+ hash = LeagueReader.read( "#{Boot.data_dir}/leagues/#{country}.txt" )
55
+ pp hash
56
+
57
+ hash.each do |season,league_hash|
58
+ if season == '*' ## assume latest / default season
59
+ @leagues_latest[ country ] = league_hash
60
+ else
61
+ @leagues_history[ country ] ||= {}
62
+ @leagues_history[ country ][ season ] = league_hash
63
+ end
64
+ end
65
+ end
66
+
67
+ pp @leagues_latest
68
+ pp @leagues_history
69
+ end
70
+
71
+
72
+
73
+ def basename( league, country:, season: )
74
+ ## todo/check: rename league: to key: - why? why not?
75
+
76
+ if country.include?( '-' ) ## assume package name e.g. eng-england etc.
77
+ ## cut off country code from package name
78
+ cc = country.split( '-' )[0] # use first part
79
+ else
80
+ cc = country
81
+ end
82
+
83
+ if season
84
+ puts " checking season >#{season}<"
85
+ ## check history if season is provided / supplied / known
86
+ history = @leagues_history[ cc ]
87
+ if history
88
+ season_start_year = SeasonUtils.start_year( season ).to_i
89
+ ##
90
+ ## todo: sorty season keys - why? why not? -- assume reverse chronological order for now
91
+ history.keys.reverse.each do |key|
92
+ history_season_start_year = SeasonUtils.start_year( key ).to_i
93
+ puts " #{season_start_year} <= #{history_season_start_year} - #{season_start_year <= history_season_start_year}"
94
+ if season_start_year <= history_season_start_year
95
+ result = history[ key ][ league ]
96
+ if result
97
+ return "#{league}-#{result}"
98
+ else
99
+ return nil
100
+ end
101
+ end
102
+ end
103
+ end
104
+ end
105
+
106
+ latest = @leagues_latest[ cc ]
107
+ if latest
108
+ result = latest[ league ]
109
+ return "#{league}-#{result}" if result
110
+ end
111
+
112
+ nil
113
+ end # method basename
114
+ end # class LeagueConfig
115
+
116
+
117
+ end ## module Import
118
+ end ## module SportDb