sportdb-formats 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Manifest.txt +21 -0
- data/lib/sportdb/formats.rb +63 -0
- data/lib/sportdb/formats/country/country_index.rb +192 -0
- data/lib/sportdb/formats/country/country_reader.rb +122 -0
- data/lib/sportdb/formats/league/league_index.rb +174 -0
- data/lib/sportdb/formats/league/league_outline_reader.rb +141 -0
- data/lib/sportdb/formats/league/league_reader.rb +162 -0
- data/lib/sportdb/formats/team/club_index.rb +336 -0
- data/lib/sportdb/formats/team/club_reader.rb +350 -0
- data/lib/sportdb/formats/team/club_reader_props.rb +75 -0
- data/lib/sportdb/formats/team/national_team_index.rb +114 -0
- data/lib/sportdb/formats/team/team_index.rb +43 -0
- data/lib/sportdb/formats/team/wiki_reader.rb +108 -0
- data/lib/sportdb/formats/version.rb +1 -1
- data/test/helper.rb +72 -0
- data/test/test_club_index.rb +183 -0
- data/test/test_club_reader.rb +201 -0
- data/test/test_club_reader_props.rb +54 -0
- data/test/test_country_index.rb +63 -0
- data/test/test_country_reader.rb +59 -0
- data/test/test_league_index.rb +157 -0
- data/test/test_league_outline_reader.rb +55 -0
- data/test/test_league_reader.rb +72 -0
- data/test/test_regex.rb +49 -0
- data/test/test_wiki_reader.rb +77 -0
- metadata +22 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 29715e2e61cd99fe3520e861b1d84c4614055650
|
4
|
+
data.tar.gz: e8109a80c7f79926c271560fd63f8503a44fabd2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8cb97f1cd4ae2d56e3b81282a7f921f99b2dcd325bafcf739e9d4c8a8bf9139fa99652a1a2804cf33604949b36b15694e7da68d851cef6d91b9b7dd727144bcf
|
7
|
+
data.tar.gz: 5967f5b9558d963cd9a6974be5b53ad343b8862c84ae4640bb6c3bd6975f80ab1a687946aa2a91aeeeecb9e5908537f530eb6e837553fccfc2d92f43db082edc
|
data/Manifest.txt
CHANGED
@@ -4,9 +4,14 @@ README.md
|
|
4
4
|
Rakefile
|
5
5
|
lib/sportdb/formats.rb
|
6
6
|
lib/sportdb/formats/config.rb
|
7
|
+
lib/sportdb/formats/country/country_index.rb
|
8
|
+
lib/sportdb/formats/country/country_reader.rb
|
7
9
|
lib/sportdb/formats/datafile.rb
|
8
10
|
lib/sportdb/formats/datafile_package.rb
|
9
11
|
lib/sportdb/formats/goals.rb
|
12
|
+
lib/sportdb/formats/league/league_index.rb
|
13
|
+
lib/sportdb/formats/league/league_outline_reader.rb
|
14
|
+
lib/sportdb/formats/league/league_reader.rb
|
10
15
|
lib/sportdb/formats/match/conf_parser.rb
|
11
16
|
lib/sportdb/formats/match/mapper.rb
|
12
17
|
lib/sportdb/formats/match/mapper_teams.rb
|
@@ -29,13 +34,27 @@ lib/sportdb/formats/structs/season.rb
|
|
29
34
|
lib/sportdb/formats/structs/standings.rb
|
30
35
|
lib/sportdb/formats/structs/team.rb
|
31
36
|
lib/sportdb/formats/structs/team_usage.rb
|
37
|
+
lib/sportdb/formats/team/club_index.rb
|
38
|
+
lib/sportdb/formats/team/club_reader.rb
|
39
|
+
lib/sportdb/formats/team/club_reader_props.rb
|
40
|
+
lib/sportdb/formats/team/national_team_index.rb
|
41
|
+
lib/sportdb/formats/team/team_index.rb
|
42
|
+
lib/sportdb/formats/team/wiki_reader.rb
|
32
43
|
lib/sportdb/formats/version.rb
|
33
44
|
test/helper.rb
|
45
|
+
test/test_club_index.rb
|
46
|
+
test/test_club_reader.rb
|
47
|
+
test/test_club_reader_props.rb
|
34
48
|
test/test_clubs.rb
|
35
49
|
test/test_conf.rb
|
50
|
+
test/test_country_index.rb
|
51
|
+
test/test_country_reader.rb
|
36
52
|
test/test_csv_reader.rb
|
37
53
|
test/test_datafile.rb
|
38
54
|
test/test_goals.rb
|
55
|
+
test/test_league_index.rb
|
56
|
+
test/test_league_outline_reader.rb
|
57
|
+
test/test_league_reader.rb
|
39
58
|
test/test_match.rb
|
40
59
|
test/test_match_auto.rb
|
41
60
|
test/test_match_auto_champs.rb
|
@@ -49,5 +68,7 @@ test/test_name_helper.rb
|
|
49
68
|
test/test_outline_reader.rb
|
50
69
|
test/test_package.rb
|
51
70
|
test/test_package_match.rb
|
71
|
+
test/test_regex.rb
|
52
72
|
test/test_scores.rb
|
53
73
|
test/test_season.rb
|
74
|
+
test/test_wiki_reader.rb
|
data/lib/sportdb/formats.rb
CHANGED
@@ -69,6 +69,69 @@ require 'sportdb/formats/match/match_parser_auto_conf'
|
|
69
69
|
require 'sportdb/formats/match/conf_parser'
|
70
70
|
|
71
71
|
|
72
|
+
require 'sportdb/formats/country/country_reader'
|
73
|
+
require 'sportdb/formats/country/country_index'
|
74
|
+
|
75
|
+
|
76
|
+
## add convenience helper
|
77
|
+
module SportDb
  module Import
    class Country
      ## shortcuts - forward to CountryReader
      class << self
        ## read countries from the file at path
        def read( path )
          CountryReader.read( path )
        end

        ## parse countries from text
        def parse( txt )
          CountryReader.parse( txt )
        end
      end
    end # class Country
  end # module Import
end # module SportDb
|
85
|
+
|
86
|
+
|
87
|
+
require 'sportdb/formats/league/league_reader'
|
88
|
+
require 'sportdb/formats/league/league_index'
|
89
|
+
require 'sportdb/formats/league/league_outline_reader'
|
90
|
+
|
91
|
+
##
|
92
|
+
## add convenience helper / short-cuts
|
93
|
+
module SportDb
  module Import
    class League
      ## shortcuts - forward to LeagueReader
      class << self
        ## read leagues from the file at path
        def read( path )
          LeagueReader.read( path )
        end

        ## parse leagues from text
        def parse( txt )
          LeagueReader.parse( txt )
        end
      end
    end # class League
  end # module Import
end # module SportDb
|
101
|
+
|
102
|
+
|
103
|
+
require 'sportdb/formats/team/club_reader'
|
104
|
+
require 'sportdb/formats/team/club_reader_props'
|
105
|
+
require 'sportdb/formats/team/club_index'
|
106
|
+
require 'sportdb/formats/team/wiki_reader'
|
107
|
+
require 'sportdb/formats/team/national_team_index'
|
108
|
+
require 'sportdb/formats/team/team_index'
|
109
|
+
|
110
|
+
|
111
|
+
###
|
112
|
+
# add convenience helpers / shortcuts
|
113
|
+
module SportDb
  module Import
    class Club
      class << self
        ## shortcuts - forward to ClubReader
        def read( path )
          ClubReader.read( path )
        end

        def parse( txt )
          ClubReader.parse( txt )
        end

        ## shortcuts - forward to ClubPropsReader
        def read_props( path )
          ClubPropsReader.read( path )
        end

        def parse_props( txt )
          ClubPropsReader.parse( txt )
        end
        ## todo/check: use ClubProps.read and ClubProps.parse convenience alternate shortcuts - why? why not?
      end
    end # class Club
  end # module Import
end # module SportDb
|
125
|
+
|
126
|
+
|
127
|
+
|
128
|
+
|
129
|
+
|
130
|
+
|
131
|
+
|
132
|
+
|
133
|
+
|
134
|
+
|
72
135
|
## let's put test configuration in its own namespace / module
|
73
136
|
module SportDb
|
74
137
|
class Test ## todo/check: works with module too? use a module - why? why not?
|
@@ -0,0 +1,192 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module SportDb
|
4
|
+
module Import
|
5
|
+
|
6
|
+
## built-in countries for (quick starter) auto-add
|
7
|
+
class CountryIndex
|
8
|
+
|
9
|
+
attr_reader :countries ## all country records
|
10
|
+
|
11
|
+
def initialize( recs )
  ## start with empty lookup tables (all records, by code, by name) ...
  @countries, @countries_by_code, @countries_by_name = [], {}, {}

  ## ... and fill them up with the records passed in
  add( recs )
end
|
18
|
+
|
19
|
+
|
20
|
+
## helpers from country - use a helper module for includes (share with clubs etc.) - why? why not?
|
21
|
+
include NameHelper
|
22
|
+
## incl. strip_year( name )
|
23
|
+
## has_year?( name)
|
24
|
+
## strip_lang( name )
|
25
|
+
## normalize( name )
|
26
|
+
|
27
|
+
|
28
|
+
## Add country records to the index.
##
## Every record gets registered
##   - in the list of all countries,
##   - by code (key as-is, plus the downcased fifa code if different from key),
##   - by name (canonical name + alt names + year-stripped variants;
##     lang markers get stripped and the name normalized before lookup).
##
## recs - records responding to key, fifa, name and alt_names
##        e.g. { key:'af', fifa:'AFG', name:'Afghanistan'}
##
## Note: on any conflict (code already taken, duplicate names inside a record,
##       name already taken) an error message gets printed and
##       the program terminates via exit 1.
##
## Returns recs (the result of recs.each).
def add( recs )
  recs.each do |rec|
    @countries << rec

    ## add codes lookups - key, fifa, ...
    if @countries_by_code[ rec.key ]
      puts "** !! ERROR !! country code (key) >#{rec.key}< already exists!!"
      exit 1
    else
      @countries_by_code[ rec.key ] = rec
    end

    ## add fifa code (only) if different from key
    fifa = rec.fifa.downcase
    if rec.key != fifa
      if @countries_by_code[ fifa ]
        puts "** !! ERROR !! country code (fifa) >#{fifa}< already exists!!"
        exit 1
      else
        @countries_by_code[ fifa ] = rec
      end
    end

    ## add all names (canonical name + alt names)
    names = [rec.name] + rec.alt_names

    ## check "hand-typed" names for year (auto-add)
    ##   e.g. (1887-1911), (-2013), (1946-2001,2013-) etc.
    names += names.select { |name| has_year?( name ) }
                  .map    { |name| strip_year( name ) }

    ## check for duplicates - simple check for now - fix/improve
    ## todo/fix: (auto)remove duplicates - why? why not?
    count      = names.size
    count_uniq = names.uniq.size
    if count != count_uniq
      puts "** !!! ERROR !!! - #{count-count_uniq} duplicate name(s):"
      pp names
      pp rec
      exit 1
    end

    names.each do |name|
      ## check lang codes e.g. [en], [fr], etc.
      ## todo/check/fix: move strip_lang up in the chain - check for duplicates (e.g. only lang code marker different etc.) - why? why not?
      name = strip_lang( name )
      norm = normalize( name )
      old_rec = @countries_by_name[ norm ]
      if old_rec
        ## name already taken (by this or another country record)
        puts "** !!! ERROR !!! - name conflict/duplicate - >#{name}< will overwrite >#{old_rec.name}< with >#{rec.name}<"
        exit 1
      else
        @countries_by_name[ norm ] = rec
      end
    end
  end ## each record
end # method add
|
98
|
+
|
99
|
+
|
100
|
+
|
101
|
+
## fix/todo: add find_by (alias for find_by_name/find_by_code)
|
102
|
+
def find_by_code( code )
  ## note: symbols are welcome too; the lookup always uses the downcased code
  ##        e.g. AUT => aut, :at => at etc.
  @countries_by_code[ code.to_s.downcase ]
end
|
106
|
+
|
107
|
+
def find_by_name( name )
  ## note: symbols are welcome too (converted with to_s first);
  ##        the lookup key is the normalized name
  lookup = normalize( name.to_s )
  @countries_by_name[ lookup ]
end
|
111
|
+
|
112
|
+
def []( key )
  ## try lookup by code first ...
  by_code = find_by_code( key )
  return by_code unless by_code.nil?

  ## ... and fallback to lookup by (normalized) name
  find_by_name( key )
end
|
117
|
+
alias_method :find, :[]
|
118
|
+
|
119
|
+
|
120
|
+
###
|
121
|
+
## split/parse country line
|
122
|
+
##
|
123
|
+
## split on bullet e.g.
|
124
|
+
## split into name and code with regex - make code optional
|
125
|
+
##
|
126
|
+
## Examples:
|
127
|
+
## Österreich • Austria (at)
|
128
|
+
## Österreich • Austria
|
129
|
+
## Austria
|
130
|
+
## Deutschland (de) • Germany
|
131
|
+
##
|
132
|
+
## todo/check: support more formats - why? why not?
|
133
|
+
## e.g. Austria, AUT (e.g. with comma - why? why not?)
|
134
|
+
## Resolve a country line to a single country record.
##
## The line gets split on the bullet (•); every part may be a name, a code
## or a name with a trailing code in brackets e.g. Austria (at).
## All parts must resolve - and resolve to the same record.
##
## line - text e.g. "Österreich • Austria (at)"
##
## Note: on any error (unknown code, unknown name, name and code or parts
##       pointing to different countries) an error message gets printed
##       and the program terminates via exit 1.
##
## Returns the country record found (or nil if the line splits into no parts
## e.g. for an empty string).
def parse( line )
  country = nil
  line.split( '•' ).each do |value|   ## use/support multi-lingual separator
    value = value.strip
    ## check for trailing country code e.g. (at), (eng), etc.
    if value =~ /[ ]+\((?<code>[a-z]{1,4})\)$/  ## e.g. Austria (at)
      code = $~[:code]
      name = value[0...(value.size-code.size-2)].strip  ## note: add -2 for brackets
      candidates = [ find_by_code( code ), find_by_name( name ) ]
      if candidates[0].nil?
        puts "** !!! ERROR !!! country - unknown code >#{code}< in line: #{line}"
        pp line
        exit 1
      end
      if candidates[1].nil?
        ## note: report the name here (not the code - the code lookup worked)
        puts "** !!! ERROR !!! country - unknown name >#{name}< in line: #{line}"
        pp line
        exit 1
      end
      if candidates[0] != candidates[1]
        puts "** !!! ERROR !!! country - name and code do NOT match the same country:"
        pp line
        pp candidates
        exit 1
      end
      if country && country != candidates[0]
        puts "** !!! ERROR !!! country - names do NOT match the same country:"
        pp line
        pp country
        pp candidates
        exit 1
      end
      country = candidates[0]
    else
      ## just assume value is name or code
      candidate = find( value )
      if candidate.nil?
        puts "** !!! ERROR !!! country - unknown name or code >#{value}< in line: #{line}"
        pp line
        exit 1
      end
      if country && country != candidate
        puts "** !!! ERROR !!! country - names do NOT match the same country:"
        pp line
        pp country
        pp candidate
        exit 1
      end
      country = candidate
    end
  end
  country
end # method parse
|
188
|
+
end # class CountryIndex
|
189
|
+
|
190
|
+
|
191
|
+
end # module Import
|
192
|
+
end # module SportDb
|
@@ -0,0 +1,122 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
module SportDb
|
5
|
+
module Import
|
6
|
+
|
7
|
+
|
8
|
+
class CountryReader
|
9
|
+
|
10
|
+
|
11
|
+
def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
  ## slurp the complete file (as utf-8 text) and hand it over to parse
  parse( File.open( path, 'r:utf-8', &:read ) )
end
|
15
|
+
|
16
|
+
def self.parse( txt )
  ## build a reader for the text and return the parse result
  reader = new( txt )
  reader.parse
end
|
19
|
+
|
20
|
+
|
21
|
+
def initialize( txt )
  ## keep the text around - gets read by parse
  @txt = txt
end
|
24
|
+
|
25
|
+
## Parse the text (@txt) into country records.
##
## The text gets split into nodes by OutlineReader.parse;
## headings (:h1, :h2) get skipped, paragraphs (:p) get read line-by-line:
##
##   - a "regular" line starts with the key followed by comma-separated values
##     (name, fifa code and optional tags)  e.g.  at  Austria, AUT
##   - a line starting with a pipe (|) is a continuation with alternative names
##     for the country of the last "regular" line  e.g.  | Österreich | Autriche
##
## Note: on any input error (unknown node type, missing key,
##       missing or wrong fifa code, alternative names before any country)
##       an error message gets printed and the program terminates via exit 1.
##
## Returns an array of Country records (in source order).
def parse
  countries    = []
  last_country = nil   ## note/check/fix: use countries[-1] - why? why not?

  OutlineReader.parse( @txt ).each do |node|
    node_type = node[0]

    if [:h1, :h2].include?( node_type )
      ## skip headings for now
    elsif node_type == :p   ## paragraph
      node[1].each do |line|
        if line.start_with?( '|' )
          ## assume continuation with line of alternative names
          if last_country.nil?
            ## note: nothing to attach the names to - report (instead of crashing on nil)
            puts "** !! ERROR - alternative names line without preceding country line:"
            pp line
            exit 1
          end
          ## note: skip leading pipe
          values = line[1..-1].split( '|' )   # team names - allow/use pipe(|)
          ## strip and squish (white)spaces
          #   e.g. East Germany    (-1989)  => East Germany (-1989)
          values = values.map { |value| value.strip.gsub( /[ \t]+/, ' ' ) }
          last_country.alt_names += values
        elsif line =~ /^([a-z]{2,4})[ ]+(.+)$/
          ## assume "regular" line
          ##  check if starts with id (todo/check: use a more "strict"/better regex capture pattern!!!)
          ##  note: allow country codes upto 4 (!!) e.g. Northern Cyprus
          key    = $1
          values = $2.split( ',' )
          ## strip and squish (white)spaces
          #   e.g. East Germany    (-1989)  => East Germany (-1989)
          values = values.map { |value| value.strip.gsub( /[ \t]+/, ' ' ) }

          ## note: remove "overlords" from geo-tree marked territories e.g. UK, US, etc. from name
          ##   e.g. England › UK     => England
          ##        Puerto Rico › US => Puerto Rico
          geos = split_geo( values[0] )
          name = geos[0]    ## note: ignore all other geos for now

          ## note: allow fifa country codes upto 4 (!!) e.g. Northern Cyprus
          fifa = values[1]
          if fifa.nil?
            puts "** !!! ERROR !!! missing fifa code for (canonical) country name"
            exit 1
          elsif fifa !~ /^[A-Z]{3,4}$/    ## note: also check format
            puts "** !!! ERROR !!! wrong fifa code format >#{values[1]}<; expected three (or four)-letter all up-case"
            exit 1
          end

          ## check if tags present
          tags = values[2] ? split_tags( values[2] ) : []

          last_country = country = Country.new( key:  key,
                                                name: name,
                                                fifa: fifa,
                                                tags: tags )
          countries << country
        else
          puts "** !! ERROR - missing key for (canonical) country name"
          exit 1
        end
      end  # each line
    else
      puts "** !! ERROR - unknown node type / (input) source line:"
      pp node
      exit 1
    end
  end  # each node

  countries
end  # method parse
|
101
|
+
|
102
|
+
|
103
|
+
|
104
|
+
#######################################
|
105
|
+
## helpers
|
106
|
+
def split_tags( str )
  ## allow pipe (|) and (<>‹›) as divider for now - add more? why? why not?
  ##   note: every tag gets stripped (leading and trailing spaces removed)
  str.split( /[|<>‹›]/ ).map( &:strip )
end
|
111
|
+
|
112
|
+
def split_geo( str )   ## todo/check: rename to parse_geo(s) - why? why not?
  ## split into geo tree - note: allow > < or › ‹ for now
  ##   and strip every geo (leading and trailing spaces removed)
  str.split( /[<>‹›]/ ).map( &:strip )
end
|
118
|
+
|
119
|
+
end # class CountryReader
|
120
|
+
|
121
|
+
end # module Import
|
122
|
+
end # module SportDb
|