sportdb-formats 1.0.0 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,75 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ module SportDb
5
+ module Import
6
+
7
+
8
class ClubPropsReader
  ## Reads club props (properties) from csv text and
  ##   updates the matching club records in the (import) catalog in-place.
  ##
  ##  Every row requires a name column holding the (canonical) club name;
  ##   the (optional) key and code columns get copied to the matching club record.
  ##
  ##  note: a missing name, no club match or an ambiguous club match is fatal,
  ##          that is, prints an error message and stops the program (exit 1).

  def catalog() Import.catalog; end


  ## read the file in path (as utf-8) and parse its text
  def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
    txt = File.open( path, 'r:utf-8' ) { |f| f.read }
    parse( txt )
  end

  ## shortcut for new( txt ).parse
  def self.parse( txt )
    new( txt ).parse
  end


  def initialize( txt )
    @txt = txt
  end

  ## Parse all rows and update the key and code of the matching club records.
  ##   Returns the value of recs.each, that is, the parsed rows
  ##   (assuming parse_csv returns an array).
  def parse
    recs = parse_csv( @txt )
    recs.each do |rec|
      name = rec[:name]
      if name.nil?
        puts "** !!! ERROR !!! Name column required / missing / NOT found in row:"
        pp rec
        exit 1
      end

      ## find / match club by (canonical) name
      m = catalog.clubs.match( name )
      if m && m.size > 1
        puts "** !!! WARN !!! ambigious (multiple) club matches (#{m.size}) for name >#{name}< in props row:"
        pp rec
        pp m

        ## todo/fix: try filter by canonical name if more than one match
        m = m.select { |club| club.name == name }
      end

      ## note: an empty match list counts as no match (same as nil);
      ##   without this check m[0] is nil below and
      ##   the update fails with a NoMethodError instead of a proper error message
      if m.nil? || m.empty?
        puts "** !!! ERROR !!! no club match for (canonical) name >#{name}< in props row:"
        pp rec
        exit 1
      elsif m.size > 1
        puts "** !!! ERROR !!! ambigious (multiple) club matches (#{m.size}) for (canonical) name >#{name}< in props row:"
        pp rec
        pp m
        exit 1
      else   ## assume size == 1, bingo!!!
        club_rec = m[0]
        ## todo/fix: warn if name differs from (canonical) name
        ## todo/fix: also add props to in-memory structs/records!!!
        ## todo/fix: only updated "on-demand" from in-memory struct/records!!!!

        ## update attributes (only if present in row)
        club_rec.key   = rec[:key]   if rec[:key]
        club_rec.code  = rec[:code]  if rec[:code]
        ## todo/fix: add (some) more props e.g. address, web, etc.
      end
    end
  end  # method parse

end # class ClubPropsReader
73
+
74
+ end ## module Import
75
+ end ## module SportDb
@@ -0,0 +1,114 @@
1
+
2
+ module SportDb
3
+ module Import
4
+
5
class NationalTeamIndex
  ## Index for national team records with lookups by (fifa) code
  ##   and by (normalized) name incl. all alternate names.
  ##
  ##  note: a duplicate code or name is fatal, that is,
  ##          prints an error message and stops the program (exit 1).

  include NameHelper
  ## incl. strip_year( name )
  ##       has_year?( name)
  ##       strip_lang( name )
  ##       normalize( name )

  attr_reader :teams   ## all (national) team records

  def initialize( recs )
    @teams         = []
    @teams_by_code = {}
    @teams_by_name = {}

    add( recs )
  end


  ## add all records to the index (by code and by all names)
  def add( recs )
    recs.each do |rec|
      @teams << rec

      ## add (fifa) code lookup - note: code key is always downcased
      code_key = rec.code.downcase
      if @teams_by_code[ code_key ]
        puts "** !! ERROR !! national team code (code) >#{rec.code}< already exits!!"
        exit 1
      end
      @teams_by_code[ code_key ] = rec

      ## collect all names (canonical name + alt names)
      names = [rec.name] + rec.alt_names

      ## auto-add a year-less version for all "hand-typed" names with year(s)
      ##   e.g. (1887-1911), (-2013), (1946-2001,2013-) etc.
      names += names.select { |name| has_year?( name ) }
                    .map    { |name| strip_year( name ) }

      ## check for duplicates - simple check for now - fix/improve
      ## todo/fix: (auto)remove duplicates - why? why not?
      dups = names.size - names.uniq.size
      if dups != 0
        puts "** !!! ERROR !!! - #{dups} duplicate name(s) in national teams:"
        pp names
        pp rec
        exit 1
      end

      names.each do |name|
        ## check lang codes e.g. [en], [fr], etc.
        ## todo/check/fix: move strip_lang up in the chain - check for duplicates (e.g. only lang code marker different etc.) - why? why not?
        plain = strip_lang( name )
        norm  = normalize( plain )

        ## note: any (normalized) name already taken is an error
        old_rec = @teams_by_name[ norm ]
        if old_rec
          puts "** !!! ERROR !!! - national team name conflict/duplicate - >#{plain}< will overwrite >#{old_rec.name}< with >#{rec.name}<"
          exit 1
        end
        @teams_by_name[ norm ] = rec
      end
    end  ## each record
  end  # method add


  ## fix/todo: add find_by (alias for find_by_name/find_by_code)

  ## lookup by code; allows symbols too and always downcases (e.g. AUT to aut)
  def find_by_code( code )
    @teams_by_code[ code.to_s.downcase ]
  end

  ## lookup by (normalized) name; allows symbols too (uses to_s first)
  def find_by_name( name )
    @teams_by_name[ normalize( name.to_s ) ]
  end

  ## try lookup by (normalized) name first - longest match first
  ##   (assume name is longer than code); fallback to lookup by code
  def find( q )
    find_by_name( q ) || find_by_code( q )
  end

  ## same as find but no match is fatal (exit 1)
  def find!( q )
    team = find( q )
    return team  unless team.nil?

    puts "** !!! ERROR - no match for national team >#{q}< found"
    exit 1
  end
end # class NationalTeamIndex
110
+
111
+
112
+ end # module Import
113
+ end # module SportDb
114
+
@@ -0,0 +1,43 @@
1
+
2
+ module SportDb
3
+ module Import
4
+
5
+
6
class TeamIndex
  ## note: "virtual" index lets you search clubs and/or national_teams (don't care)

  def catalog() Import.catalog; end

  ## Find the team for name in the league;
  ##   pass in an array of names to get back an array of teams (same order).
  ##
  ## todo/check: rename to/use map_by! for array version - why? why not?
  def find_by!( name:, league:, mods: nil )
    ## single name? - lookup and done
    return __find_by!( name: name, league: league, mods: mods )  unless name.is_a?( Array )

    name.map { |q| __find_by!( name: q, league: league, mods: mods ) }
  end

  ## Lookup for a single name; checks in order
  ##   1) mods - (optional) overrides, that is, mods[ league.key ][ name ]
  ##   2) clubs - if league.clubs?
  ##        all clubs for an intl league or
  ##        clubs by country for a domestic/national league
  ##   3) national teams - otherwise
  def __find_by!( name:, league:, mods: nil )
    mod = mods && mods[ league.key ] && mods[ league.key ][ name ]
    return mod  if mod

    if !league.clubs?      ## assume national teams (not clubs)
      catalog.national_teams.find!( name )
    elsif league.intl?     ## todo/fix: add intl? to ActiveRecord league!!!
      catalog.clubs.find!( name )
    else                   ## assume clubs in domestic/national league tournament
      catalog.clubs.find_by!( name: name, country: league.country )
    end
  end # method __find_by!

end # class TeamIndex
41
+
42
+ end # module Import
43
+ end # module SportDb
@@ -0,0 +1,108 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ module SportDb
5
+ module Import
6
+
7
+
8
class WikiReader    ## todo/check: rename to WikiClubReader - why? why not?
  ## Reads a list of (wiki) club names grouped by country headings, e.g.
  ##
  ##   = Belgium
  ##   R.S.C. Anderlecht
  ##   Club Brugge KV
  ##
  ##  and returns one WikiClub record (name + country) per club line.
  ##
  ##  note: a heading level deeper than one, an unknown country or
  ##         a club line before any country heading is fatal (exit 1).

  class WikiClub  # nested class
    attr_reader :name, :country

    def initialize( name, country )
      @name    = name
      @country = country
    end
  end # (nested) class WikiClub


  def catalog() Import.catalog; end


  ## read the file in path (as utf-8) and parse its text
  def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
    txt = File.open( path, 'r:utf-8' ) { |f| f.read }
    parse( txt )
  end

  ## shortcut for new( txt ).parse
  def self.parse( txt )
    new( txt ).parse
  end

  def initialize( txt )
    @txt = txt
  end

  ## parse the text line-by-line; returns an array of WikiClub records
  def parse
    recs         = []
    last_country = nil   ## note: supports only one level of headings for now (and that is a country)

    @txt.each_line do |raw|
      line = raw.strip

      ## skip blank lines and comment lines
      next if line.empty? || line.start_with?( '#' )

      ## strip inline (until end-of-line) comments too
      ##   e.g Eupen => KAS Eupen, ## [de]
      ##     => Eupen => KAS Eupen,
      line = line.sub( /#.*/, '' ).strip
      pp line

      next if line =~ /^=+$/   ## skip "decorative" only heading e.g. ========

      ## note: like in wikimedia markup (and markdown) all optional trailing ==== too
      ## todo/check: allow === Text =-=-=-=-=-= too - why? why not?
      ##   regex reads:  leading ====== / text (note: for now no "inline" = allowed) / (optional) trailing ====
      m = line.match( /^(=+)([^=]+?)=*$/ )
      if m
        heading_level = m[1].length   ## count number of = for heading level
        heading       = m[2].strip

        puts "heading #{heading_level} >#{heading}<"

        if heading_level > 1
          puts "** !!! ERROR [wiki reader] !!! - - headings level too deep - only top / one level supported for now; sorry"
          exit 1
        end

        ## assume country in heading; allow all "formats" supported by parse e.g.
        ##   Österreich • Austria (at)
        ##   Österreich • Austria
        ##   Austria
        ##   Deutschland (de) • Germany
        country = catalog.countries.parse( heading )
        ## check country code - MUST exist for now!!!!
        if country.nil?
          puts "!!! error [wiki reader] - unknown country >#{heading}< - sorry - add country to config to fix"
          exit 1
        end

        last_country = country
        pp last_country
      else
        ## squish (white)spaces
        ##   e.g. New York FC      (2011-)  => New York FC (2011-)
        ## and normalize (allow underscore (_) - replace with space)
        ##   e.g. Cercle_Brugge_K.S.V.  => Cercle Brugge K.S.V.
        value = line.gsub( /[ \t]+/, ' ' ).tr( '_', ' ' )

        if last_country.nil?
          puts "** !!! ERROR [wiki reader] !!! - country heading missing for club name; sorry - add country heading to fix"
          exit 1
        end

        recs << WikiClub.new( value, last_country )
      end
    end  # each_line
    recs
  end  # method parse

end # class WikiReader
106
+
107
+ end ## module Import
108
+ end ## module SportDb
@@ -6,7 +6,7 @@ module Formats
6
6
 
7
7
  MAJOR = 1 ## todo: namespace inside version or something - why? why not??
8
8
  MINOR = 0
9
- PATCH = 0
9
+ PATCH = 1
10
10
  VERSION = [MAJOR,MINOR,PATCH].join('.')
11
11
 
12
12
  def self.version
data/test/helper.rb CHANGED
@@ -10,6 +10,78 @@ require 'minitest/autorun'
10
10
  require 'sportdb/formats'
11
11
 
12
12
 
13
+
14
+ module SportDb
15
+ module Import
16
+
17
+ class TestCatalog  ## catalog for tests - offers countries, leagues and clubs indexes (each built on first use; see accessors at the end)
18
+ def build_country_index  ## reads the country records from the test data dir and wraps them in a CountryIndex
19
+ recs = CountryReader.read( "#{Test.data_dir}/world/countries.txt" )
20
+ index = CountryIndex.new( recs )
21
+ index
22
+ end
23
+
24
+ def build_league_index  ## parses the inline league fixture (England and Scotland) and adds the records to a new LeagueIndex
25
+ recs = SportDb::Import::LeagueReader.parse( <<TXT )
26
+ = England =
27
+ 1 English Premier League
28
+ | ENG PL | England Premier League | Premier League
29
+ 2 English Championship
30
+ | ENG CS | England Championship | Championship
31
+ 3 English League One
32
+ | England League One | League One
33
+ 4 English League Two
34
+ 5 English National League
35
+
36
+ cup EFL Cup
37
+ | League Cup | Football League Cup
38
+ | ENG LC | England Liga Cup
39
+
40
+ = Scotland =
41
+ 1 Scottish Premiership
42
+ 2 Scottish Championship
43
+ 3 Scottish League One
44
+ 4 Scottish League Two
45
+ TXT
46
+
47
+ leagues = SportDb::Import::LeagueIndex.new
48
+ leagues.add( recs )
49
+ leagues
50
+ end
51
+
52
+ def build_club_index  ## parses the inline club fixture (seven clubs listed under England) and adds the records to a new ClubIndex
53
+ recs = ClubReader.parse( <<TXT )
54
+ = England
55
+
56
+ Chelsea FC
57
+ Arsenal FC
58
+ Tottenham Hotspur
59
+ West Ham United
60
+ Crystal Palace
61
+ Manchester United
62
+ Manchester City
63
+ TXT
64
+
65
+ index = ClubIndex.new
66
+ index.add( recs )
67
+ index
68
+ end
69
+
70
+
71
+ def countries() @countries ||= build_country_index; end  ## note: memoized - index gets built once on first call
72
+ def leagues() @leagues ||= build_league_index; end  ## note: memoized - index gets built once on first call
73
+ def clubs() @clubs ||= build_club_index; end  ## note: memoized - index gets built once on first call
74
+ end
75
+
76
+ configure do |config|  ## set a TestCatalog instance as the catalog in the config (presumably what Import.catalog returns - confirm in configure)
77
+ config.catalog = TestCatalog.new
78
+ end
79
+
80
+ end # module Import
81
+ end # module SportDb
82
+
83
+
84
+
13
85
  ################
14
86
  ## helper
15
87
 
@@ -0,0 +1,183 @@
1
+ # encoding: utf-8
2
+
3
+ ###
4
+ # to run use
5
+ # ruby -I ./lib -I ./test test/test_club_index.rb
6
+
7
+
8
+ require 'helper'
9
+
10
+ class TestClubIndex < MiniTest::Test  ## tests ClubIndex lookups (match, match_by, find_by!, []) and wikipedia names added via add_wiki
11
+
12
+ def test_match  ## fixture note: three clubs (England, Russia, Argentina) share the alt name Arsenal
13
+
14
+ recs = SportDb::Import::ClubReader.parse( <<TXT )
15
+ =================================
16
+ = Austria
17
+
18
+ == Wien ==
19
+
20
+ FK Austria Wien, Wien (Favoriten)
21
+ | Austria Vienna | Austria Wien
22
+ SK Rapid Wien, Wien (Hütteldorf)
23
+ | Rapid Vienna | Rapid Wien
24
+
25
+
26
+ ====================================
27
+ = England
28
+
29
+ Arsenal FC, 1886, @ Emirates Stadium, London ## Greater London
30
+ | Arsenal | FC Arsenal
31
+ Chelsea FC, 1905, @ Stamford Bridge, London ## Greater London
32
+ | Chelsea | FC Chelsea
33
+
34
+
35
+ =====================================
36
+ = Russia
37
+
38
+ Arsenal Tula, Tula
39
+ | Arsenal | FC Arsenal Tula
40
+
41
+
42
+ ===========================
43
+ = Argentina
44
+
45
+ == Buenos Aires ==
46
+
47
+ Arsenal de Sarandí, Sarandí › Buenos Aires # Sarandí es una ciudad de la Zona Sur del Gran Buenos Aires
48
+ | Arsenal | Arsenal Sarandi
49
+ | Arsenal FC | Arsenal Fútbol Club
50
+ TXT
51
+
52
+ clubs = SportDb::Import::ClubIndex.new
53
+ clubs.add( recs )
54
+
55
+ pp clubs.errors
56
+
57
+ clubs.dump_duplicates
58
+
59
+ m = clubs.match( 'Rapid Wien' )  ## match by alt name
60
+ assert_equal 'SK Rapid Wien', m[0].name
61
+ assert_equal 'Austria', m[0].country.name
62
+ assert_equal 'Wien', m[0].city
63
+
64
+ m = clubs.match( 'rapid wien' )  ## same match expected in all lowercase
65
+ assert_equal 'SK Rapid Wien', m[0].name
66
+ assert_equal 'Austria', m[0].country.name
67
+ assert_equal 'Wien', m[0].city
68
+
69
+ ## note: all dots (.) get always removed
70
+ m = clubs.match( '...r.a.p.i.d w.i.e.n...' )
71
+ assert_equal 'SK Rapid Wien', m[0].name
72
+ assert_equal 'Austria', m[0].country.name
73
+ assert_equal 'Wien', m[0].city
74
+
75
+ ## note: all spaces and dashes (-) get always removed
76
+ m = clubs.match( '--- r a p i d w i e n ---' )
77
+ assert_equal 'SK Rapid Wien', m[0].name
78
+ assert_equal 'Austria', m[0].country.name
79
+ assert_equal 'Wien', m[0].city
80
+
81
+ m = clubs.match( 'RAPID WIEN' )  ## same match expected in all uppercase
82
+ assert_equal 'SK Rapid Wien', m[0].name
83
+ assert_equal 'Austria', m[0].country.name
84
+ assert_equal 'Wien', m[0].city
85
+
86
+
87
+ c = clubs[ 'SK Rapid Wien' ] ## check canonical name match (only)
88
+ assert_equal 'SK Rapid Wien', c.name
89
+ assert_equal 'Austria', c.country.name
90
+ assert_equal 'Wien', c.city
91
+
92
+
93
+ m = clubs.match( 'Arsenal' )  ## ambiguous - expects the clubs from England, Russia and Argentina
94
+ assert_equal 3, m.size
95
+
96
+ m = clubs.match( 'ARSENAL' )
97
+ assert_equal 3, m.size
98
+
99
+ m = clubs.match_by( name: 'Arsenal', country: 'eng' )  ## passing in a country narrows the match to one club
100
+ assert_equal 1, m.size
101
+ assert_equal 'Arsenal FC', m[0].name
102
+ assert_equal 'England', m[0].country.name
103
+ assert_equal 'London', m[0].city
104
+
105
+ club = clubs.find_by!( name: 'Arsenal', country: 'eng' )  ## note: returns the club record itself (not an array)
106
+ assert_equal 'Arsenal FC', club.name
107
+ assert_equal 'England', club.country.name
108
+ assert_equal 'London', club.city
109
+
110
+
111
+ m = clubs.match_by( name: 'Arsenal', country: 'ar' )
112
+ assert_equal 1, m.size
113
+ assert_equal 'Arsenal de Sarandí', m[0].name
114
+ assert_equal 'Argentina', m[0].country.name
115
+ assert_equal 'Sarandí', m[0].city
116
+
117
+ club = clubs.find_by!( name: 'Arsenal', country: 'ar' )
118
+ assert_equal 'Arsenal de Sarandí', club.name
119
+ assert_equal 'Argentina', club.country.name
120
+ assert_equal 'Sarandí', club.city
121
+
122
+
123
+ m = clubs.match_by( name: 'Arsenal', country: 'ru' )
124
+ assert_equal 1, m.size
125
+ assert_equal 'Arsenal Tula', m[0].name
126
+ assert_equal 'Russia', m[0].country.name
127
+ assert_equal 'Tula', m[0].city
128
+
129
+
130
+ m = clubs.match( 'Arsenal FC' )  ## expects two: England (canonical name) and Argentina (alt name)
131
+ assert_equal 2, m.size
132
+
133
+ m = clubs.match( 'Arsenal F.C.' )
134
+ assert_equal 2, m.size
135
+
136
+ m = clubs.match( '...A.r.s.e.n.a.l... F.C...' )
137
+ assert_equal 2, m.size
138
+ end
139
+
140
+
141
+ def test_wikipedia # test wikipedia names and links/urls
142
+
143
+ recs = SportDb::Import::ClubReader.parse( <<TXT )
144
+ ==================================
145
+ = Belgium
146
+
147
+ == Brussels ==
148
+
149
+ RSC Anderlecht, 1908, Brussels ## use (just) Anderlecht or Brussel-Anderlecht ??
150
+ | Anderlecht | R.S.C. Anderlecht | Royal Sporting Club Anderlecht
151
+
152
+ == West-Vlaanderen › Vlaanderen ==
153
+
154
+ Club Brugge, 1891, Brugge › West-Vlaanderen › Vlaanderen
155
+ | Club Brugge KV | Club Brugge Koninklijke Voetbalvereniging
156
+ TXT
157
+
158
+
159
+ clubs = SportDb::Import::ClubIndex.new
160
+ clubs.add( recs )
161
+
162
+ recs = SportDb::Import::WikiReader.parse( <<TXT )
163
+ ==================================
164
+ = Belgium
165
+
166
+ R.S.C. Anderlecht
167
+ Club Brugge KV
168
+ TXT
169
+ clubs.add_wiki( recs )  ## presumably links the wiki page names to the matching clubs (see asserts on wikipedia below)
170
+
171
+
172
+ m = clubs.match( 'Club Brugge KV' )
173
+ assert_equal 1, m.size
174
+ assert_equal 'Club Brugge KV', m[0].wikipedia
175
+ assert_equal 'https://en.wikipedia.org/wiki/Club_Brugge_KV', m[0].wikipedia_url  ## note: url expected with underscores (_) for spaces
176
+
177
+ m = clubs.match( 'RSC Anderlecht' )
178
+ assert_equal 1, m.size
179
+ assert_equal 'R.S.C. Anderlecht', m[0].wikipedia
180
+ assert_equal 'https://en.wikipedia.org/wiki/R.S.C._Anderlecht', m[0].wikipedia_url
181
+ end
182
+
183
+ end # class TestClubIndex