sportdb-formats 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Manifest.txt +21 -0
- data/lib/sportdb/formats.rb +63 -0
- data/lib/sportdb/formats/country/country_index.rb +192 -0
- data/lib/sportdb/formats/country/country_reader.rb +122 -0
- data/lib/sportdb/formats/league/league_index.rb +174 -0
- data/lib/sportdb/formats/league/league_outline_reader.rb +141 -0
- data/lib/sportdb/formats/league/league_reader.rb +162 -0
- data/lib/sportdb/formats/team/club_index.rb +336 -0
- data/lib/sportdb/formats/team/club_reader.rb +350 -0
- data/lib/sportdb/formats/team/club_reader_props.rb +75 -0
- data/lib/sportdb/formats/team/national_team_index.rb +114 -0
- data/lib/sportdb/formats/team/team_index.rb +43 -0
- data/lib/sportdb/formats/team/wiki_reader.rb +108 -0
- data/lib/sportdb/formats/version.rb +1 -1
- data/test/helper.rb +72 -0
- data/test/test_club_index.rb +183 -0
- data/test/test_club_reader.rb +201 -0
- data/test/test_club_reader_props.rb +54 -0
- data/test/test_country_index.rb +63 -0
- data/test/test_country_reader.rb +59 -0
- data/test/test_league_index.rb +157 -0
- data/test/test_league_outline_reader.rb +55 -0
- data/test/test_league_reader.rb +72 -0
- data/test/test_regex.rb +49 -0
- data/test/test_wiki_reader.rb +77 -0
- metadata +22 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 29715e2e61cd99fe3520e861b1d84c4614055650
|
4
|
+
data.tar.gz: e8109a80c7f79926c271560fd63f8503a44fabd2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8cb97f1cd4ae2d56e3b81282a7f921f99b2dcd325bafcf739e9d4c8a8bf9139fa99652a1a2804cf33604949b36b15694e7da68d851cef6d91b9b7dd727144bcf
|
7
|
+
data.tar.gz: 5967f5b9558d963cd9a6974be5b53ad343b8862c84ae4640bb6c3bd6975f80ab1a687946aa2a91aeeeecb9e5908537f530eb6e837553fccfc2d92f43db082edc
|
data/Manifest.txt
CHANGED
@@ -4,9 +4,14 @@ README.md
|
|
4
4
|
Rakefile
|
5
5
|
lib/sportdb/formats.rb
|
6
6
|
lib/sportdb/formats/config.rb
|
7
|
+
lib/sportdb/formats/country/country_index.rb
|
8
|
+
lib/sportdb/formats/country/country_reader.rb
|
7
9
|
lib/sportdb/formats/datafile.rb
|
8
10
|
lib/sportdb/formats/datafile_package.rb
|
9
11
|
lib/sportdb/formats/goals.rb
|
12
|
+
lib/sportdb/formats/league/league_index.rb
|
13
|
+
lib/sportdb/formats/league/league_outline_reader.rb
|
14
|
+
lib/sportdb/formats/league/league_reader.rb
|
10
15
|
lib/sportdb/formats/match/conf_parser.rb
|
11
16
|
lib/sportdb/formats/match/mapper.rb
|
12
17
|
lib/sportdb/formats/match/mapper_teams.rb
|
@@ -29,13 +34,27 @@ lib/sportdb/formats/structs/season.rb
|
|
29
34
|
lib/sportdb/formats/structs/standings.rb
|
30
35
|
lib/sportdb/formats/structs/team.rb
|
31
36
|
lib/sportdb/formats/structs/team_usage.rb
|
37
|
+
lib/sportdb/formats/team/club_index.rb
|
38
|
+
lib/sportdb/formats/team/club_reader.rb
|
39
|
+
lib/sportdb/formats/team/club_reader_props.rb
|
40
|
+
lib/sportdb/formats/team/national_team_index.rb
|
41
|
+
lib/sportdb/formats/team/team_index.rb
|
42
|
+
lib/sportdb/formats/team/wiki_reader.rb
|
32
43
|
lib/sportdb/formats/version.rb
|
33
44
|
test/helper.rb
|
45
|
+
test/test_club_index.rb
|
46
|
+
test/test_club_reader.rb
|
47
|
+
test/test_club_reader_props.rb
|
34
48
|
test/test_clubs.rb
|
35
49
|
test/test_conf.rb
|
50
|
+
test/test_country_index.rb
|
51
|
+
test/test_country_reader.rb
|
36
52
|
test/test_csv_reader.rb
|
37
53
|
test/test_datafile.rb
|
38
54
|
test/test_goals.rb
|
55
|
+
test/test_league_index.rb
|
56
|
+
test/test_league_outline_reader.rb
|
57
|
+
test/test_league_reader.rb
|
39
58
|
test/test_match.rb
|
40
59
|
test/test_match_auto.rb
|
41
60
|
test/test_match_auto_champs.rb
|
@@ -49,5 +68,7 @@ test/test_name_helper.rb
|
|
49
68
|
test/test_outline_reader.rb
|
50
69
|
test/test_package.rb
|
51
70
|
test/test_package_match.rb
|
71
|
+
test/test_regex.rb
|
52
72
|
test/test_scores.rb
|
53
73
|
test/test_season.rb
|
74
|
+
test/test_wiki_reader.rb
|
data/lib/sportdb/formats.rb
CHANGED
@@ -69,6 +69,69 @@ require 'sportdb/formats/match/match_parser_auto_conf'
|
|
69
69
|
require 'sportdb/formats/match/conf_parser'
|
70
70
|
|
71
71
|
|
72
|
+
require 'sportdb/formats/country/country_reader'
|
73
|
+
require 'sportdb/formats/country/country_index'
|
74
|
+
|
75
|
+
|
76
|
+
## add convenience helper
|
77
|
+
module SportDb
module Import
  ## Convenience shortcuts - lets callers write Country.read / Country.parse;
  ##  both simply delegate to the CountryReader.
  class Country
    class << self
      ## Shortcut for CountryReader.read( path )
      def read( path )
        CountryReader.read( path )
      end

      ## Shortcut for CountryReader.parse( txt )
      def parse( txt )
        CountryReader.parse( txt )
      end
    end
  end   # class Country
end   # module Import
end   # module SportDb
|
85
|
+
|
86
|
+
|
87
|
+
require 'sportdb/formats/league/league_reader'
|
88
|
+
require 'sportdb/formats/league/league_index'
|
89
|
+
require 'sportdb/formats/league/league_outline_reader'
|
90
|
+
|
91
|
+
##
|
92
|
+
## add convenience helper / short-cuts
|
93
|
+
module SportDb
module Import
  ## Convenience shortcuts - League.read / League.parse just hand over
  ##  to the LeagueReader methods of the same name.
  class League
    ## Shortcut for LeagueReader.read( path )
    def self.read( path )
      LeagueReader.read( path )
    end

    ## Shortcut for LeagueReader.parse( txt )
    def self.parse( txt )
      LeagueReader.parse( txt )
    end
  end   # class League
end   # module Import
end   # module SportDb
|
101
|
+
|
102
|
+
|
103
|
+
require 'sportdb/formats/team/club_reader'
|
104
|
+
require 'sportdb/formats/team/club_reader_props'
|
105
|
+
require 'sportdb/formats/team/club_index'
|
106
|
+
require 'sportdb/formats/team/wiki_reader'
|
107
|
+
require 'sportdb/formats/team/national_team_index'
|
108
|
+
require 'sportdb/formats/team/team_index'
|
109
|
+
|
110
|
+
|
111
|
+
###
|
112
|
+
# add convenience helpers / shortcuts
|
113
|
+
module SportDb
module Import
  ## Convenience shortcuts - hand over to the ClubReader and
  ##  the ClubPropsReader methods.
  class Club
    class << self
      ## Shortcut for ClubReader.read( path )
      def read( path )
        ClubReader.read( path )
      end

      ## Shortcut for ClubReader.parse( txt )
      def parse( txt )
        ClubReader.parse( txt )
      end

      ## Shortcut for ClubPropsReader.read( path )
      def read_props( path )
        ClubPropsReader.read( path )
      end

      ## Shortcut for ClubPropsReader.parse( txt )
      def parse_props( txt )
        ClubPropsReader.parse( txt )
      end
      ## todo/check: use ClubProps.read and ClubProps.parse convenience alternate shortcuts - why? why not?
    end
  end   # class Club
end   # module Import
end   # module SportDb
|
125
|
+
|
126
|
+
|
127
|
+
|
128
|
+
|
129
|
+
|
130
|
+
|
131
|
+
|
132
|
+
|
133
|
+
|
134
|
+
|
72
135
|
## let's put test configuration in its own namespace / module
|
73
136
|
module SportDb
|
74
137
|
class Test ## todo/check: works with module too? use a module - why? why not?
|
@@ -0,0 +1,192 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module SportDb
|
4
|
+
module Import
|
5
|
+
|
6
|
+
## built-in countries for (quick starter) auto-add
|
7
|
+
class CountryIndex
|
8
|
+
|
9
|
+
attr_reader :countries ## all country records
|
10
|
+
|
11
|
+
## Set up the (empty) lookup tables and index all passed-in country records.
##
##  recs - country records; handed over as-is to add
def initialize( recs )
  @countries = []                                    ## all country records (in insertion order)
  @countries_by_code, @countries_by_name = {}, {}    ## lookup tables filled by add

  add( recs )
end
|
18
|
+
|
19
|
+
|
20
|
+
## helpers from country - use a helper module for includes (share with clubs etc.) - why? why not?
|
21
|
+
include NameHelper
|
22
|
+
## incl. strip_year( name )
|
23
|
+
## has_year?( name)
|
24
|
+
## strip_lang( name )
|
25
|
+
## normalize( name )
|
26
|
+
|
27
|
+
|
28
|
+
## Add country records to the index.
##
## Every record gets appended to the list of all countries and registered
##  for lookup by its key, by its (downcased) fifa code (if different
##  from the key) and by all of its (normalized) names, that is,
##  the canonical name plus all alternative names.
##
##  recs - records responding to key, fifa, name and alt_names
##           e.g. { key:'af', fifa:'AFG', name:'Afghanistan'}
##
## Note: prints an error message and exits (with exit 1) on a duplicate
##   code, on duplicate names inside a record or on a name that is
##   already registered in the index.
def add( recs )
  recs.each do |rec|
    @countries << rec

    ## add codes lookups - key, fifa, ...
    if @countries_by_code[ rec.key ]
      puts "** !! ERROR !! country code (key) >#{rec.key}< already exists!!"
      exit 1
    else
      @countries_by_code[ rec.key ] = rec
    end

    ## add fifa code (only) if different from key
    fifa = rec.fifa.downcase
    if rec.key != fifa
      if @countries_by_code[ fifa ]
        puts "** !! ERROR !! country code (fifa) >#{fifa}< already exists!!"
        exit 1
      else
        @countries_by_code[ fifa ] = rec
      end
    end

    ## add all names (canonical name + alt names)
    names = [rec.name] + rec.alt_names

    ## check "hand-typed" names for year (auto-add)
    ##   check for year(s) e.g. (1887-1911), (-2013),
    ##                          (1946-2001,2013-) etc.
    ##  and add the name with the year(s) stripped too
    names += names.select { |name| has_year?( name ) }
                  .map    { |name| strip_year( name ) }

    ## check for duplicates - simple check for now - fix/improve
    ## todo/fix: (auto)remove duplicates - why? why not?
    count      = names.size
    count_uniq = names.uniq.size
    if count != count_uniq
      puts "** !!! ERROR !!! - #{count-count_uniq} duplicate name(s):"
      pp names
      pp rec
      exit 1
    end

    names.each do |name|
      ## check lang codes e.g. [en], [fr], etc.
      ## todo/check/fix: move strip_lang up in the chain - check for duplicates (e.g. only lang code marker different etc.) - why? why not?
      name = strip_lang( name )
      norm = normalize( name )
      old_rec = @countries_by_name[ norm ]
      if old_rec
        ## name already registered (by this or another country record) - never overwrite
        msg = "** !!! ERROR !!! - name conflict/duplicate - >#{name}< will overwrite >#{old_rec.name}< with >#{rec.name}<"
        puts msg
        exit 1
      else
        @countries_by_name[ norm ] = rec
      end
    end
  end  ## each record
end   # method add
|
98
|
+
|
99
|
+
|
100
|
+
|
101
|
+
## fix/todo: add find_by (alias for find_by_name/find_by_code)
|
102
|
+
## Look up a country record by code (key or fifa code).
##  note: the code gets converted to a string and downcased first
##    - allows symbols and up-case codes e.g. AUT or :aut
##  returns nil if no record is registered for the code
def find_by_code( code )
  @countries_by_code[ code.to_s.downcase ]
end
|
106
|
+
|
107
|
+
## Look up a country record by name.
##  note: the name gets converted to a string (allows symbols too)
##    and normalized before the lookup
##  returns nil if no record is registered for the (normalized) name
def find_by_name( name )
  @countries_by_name[ normalize( name.to_s ) ]
end
|
111
|
+
|
112
|
+
## Look up a country record by code first and
##  (only if nothing is found) by (normalized) name second.
##  returns nil if both lookups fail
def []( key )
  by_code = find_by_code( key )
  return by_code  unless by_code.nil?

  find_by_name( key )    ## try lookup / find by (normalized) name
end
alias find []
|
118
|
+
|
119
|
+
|
120
|
+
###
|
121
|
+
## split/parse country line
|
122
|
+
##
|
123
|
+
## split on bullet e.g.
|
124
|
+
## split into name and code with regex - make code optional
|
125
|
+
##
|
126
|
+
## Examples:
|
127
|
+
## Österreich • Austria (at)
|
128
|
+
## Österreich • Austria
|
129
|
+
## Austria
|
130
|
+
## Deutschland (de) • Germany
|
131
|
+
##
|
132
|
+
## todo/check: support more formats - why? why not?
|
133
|
+
## e.g. Austria, AUT (e.g. with comma - why? why not?)
|
134
|
+
## Parse a country line and return the matching country record.
##
## The line gets split on the bullet (•); every value is either
##  a name with a trailing code in brackets e.g. Austria (at)
##  or just a name or code. All values must resolve to
##  the same country record.
##
##  line - e.g. Österreich • Austria (at)
##
##  returns the country record (or nil if the line has no values)
##
## Note: prints an error message and exits (with exit 1) on an unknown
##   code or name or if the values do NOT match the same country.
def parse( line )
  values  = line.split( '•' )   ## use/support multi-lingual separator
  country = nil

  values.each do |value|
    value = value.strip
    ## check for trailing country code e.g. (at), (eng), etc.
    if value =~ /[ ]+\((?<code>[a-z]{1,4})\)$/  ## e.g. Austria (at)
      code = $~[:code]
      name = value[0...(value.size-code.size-2)].strip  ## note: add -2 for brackets
      candidates = [ find_by_code( code ), find_by_name( name ) ]
      if candidates[0].nil?
        puts "** !!! ERROR !!! country - unknown code >#{code}< in line: #{line}"
        pp line
        exit 1
      end
      if candidates[1].nil?
        ## note: report the name (not the code) that failed the lookup
        puts "** !!! ERROR !!! country - unknown name >#{name}< in line: #{line}"
        pp line
        exit 1
      end
      if candidates[0] != candidates[1]
        puts "** !!! ERROR !!! country - name and code do NOT match the same country:"
        pp line
        pp candidates
        exit 1
      end
      if country && country != candidates[0]
        puts "** !!! ERROR !!! country - names do NOT match the same country:"
        pp line
        pp country
        pp candidates
        exit 1
      end
      country = candidates[0]
    else
      ## just assume value is name or code
      candidate = find( value )
      if candidate.nil?
        puts "** !!! ERROR !!! country - unknown name or code >#{value}< in line: #{line}"
        pp line
        exit 1
      end
      if country && country != candidate
        puts "** !!! ERROR !!! country - names do NOT match the same country:"
        pp line
        pp country
        pp candidate
        exit 1
      end
      country = candidate
    end
  end
  country
end  # method parse
|
188
|
+
end # class CountryIndex
|
189
|
+
|
190
|
+
|
191
|
+
end # module Import
|
192
|
+
end # module SportDb
|
@@ -0,0 +1,122 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
module SportDb
|
5
|
+
module Import
|
6
|
+
|
7
|
+
|
8
|
+
class CountryReader
|
9
|
+
|
10
|
+
|
11
|
+
## Read the file at path (as utf-8 text) and parse it.
##   use - rename to read_file or from_file etc. - why? why not?
def self.read( path )
  parse( File.read( path, encoding: 'utf-8' ) )
end
|
15
|
+
|
16
|
+
## Parse the text; builds a new reader for txt and
##  returns the result of its (instance) parse method.
def self.parse( txt )
  reader = new( txt )
  reader.parse
end
|
19
|
+
|
20
|
+
|
21
|
+
## Keep the text for later; all the parsing happens in parse.
##
##  txt - text handed over to the OutlineReader by parse
def initialize( txt )
  @txt = txt
end
|
24
|
+
|
25
|
+
## Parse the text into an array of country records.
##
## The text gets read via the OutlineReader; headings (h1, h2) are skipped
##  and every line in a paragraph (p) is either
##   - a "regular" line starting with the key followed by
##      comma-separated values, that is, name, fifa code and (optional) tags
##      e.g.   at  Austria, AUT
##   - or a continuation line starting with a pipe (|) listing alternative
##      names for the last country e.g.   | Österreich [de]
##
##  returns an array of Country records (in order of appearance)
##
## Note: prints an error message and exits (with exit 1) on an unknown node
##   type, a missing key, a missing or wrong fifa code or
##   on an alternative names line without any country line before.
def parse
  countries    = []
  last_country = nil    ## note/check/fix: use countries[-1] - why? why not?

  OutlineReader.parse( @txt ).each do |node|
    node_type = node[0]

    ## skip headings for now
    next  if [:h1, :h2].include?( node_type )

    unless node_type == :p   ## paragraph
      puts "** !! ERROR - unknown node type / (input) source line:"
      pp node
      exit 1
    end

    lines = node[1]
    lines.each do |line|
      if line.start_with?( '|' )
        ## assume continuation with line of alternative names
        ##  note: needs a country line before - report input error (instead of failing on nil)
        if last_country.nil?
          puts "** !! ERROR - alternative names line without (canonical) country name line before:"
          pp line
          exit 1
        end

        ## note: skip leading pipe
        values = line[1..-1].split( '|' )   # team names - allow/use pipe(|)
        ## strip and squish (white)spaces
        #   e.g. East Germany   (-1989)  => East Germany (-1989)
        values = values.map { |value| value.strip.gsub( /[ \t]+/, ' ' ) }
        last_country.alt_names += values
      elsif line =~ /^([a-z]{2,4})[ ]+(.+)$/
        ## assume "regular" line
        ## check if starts with id (todo/check: use a more "strict"/better regex capture pattern!!!)
        ##  note: allow country codes upto 4 (!!) e.g. Northern Cyprus
        key    = $1
        values = $2.split( ',' )
        ## strip and squish (white)spaces
        #   e.g. East Germany   (-1989)  => East Germany (-1989)
        values = values.map { |value| value.strip.gsub( /[ \t]+/, ' ' ) }

        ## note: remove "overlords" from geo-tree marked territories e.g. UK, US, etc. from name
        ##    e.g. England › UK     => England
        ##         Puerto Rico › US => Puerto Rico
        geos = split_geo( values[0] )
        name = geos[0]    ## note: ignore all other geos for now

        ## note: allow fifa country codes upto 4 (!!) e.g. Northern Cyprus
        fifa = if values[1] && values[1] =~ /^[A-Z]{3,4}$/   ## note: also check format
                 values[1]
               else
                 if values[1]
                   puts "** !!! ERROR !!! wrong fifa code format >#{values[1]}<; expected three (or four)-letter all up-case"
                 else
                   puts "** !!! ERROR !!! missing fifa code for (canonical) country name"
                 end
                 exit 1
               end

        tags = if values[2]   ## check if tags presents
                 split_tags( values[2] )
               else
                 []
               end

        last_country = Country.new( key:  key,
                                    name: name,
                                    fifa: fifa,
                                    tags: tags )
        countries << last_country
      else
        puts "** !! ERROR - missing key for (canonical) country name"
        exit 1
      end
    end  # each line
  end  # each node

  countries
end  # method parse
|
101
|
+
|
102
|
+
|
103
|
+
|
104
|
+
#######################################
|
105
|
+
## helpers
|
106
|
+
## Split the string into tags and strip leading and trailing spaces from every tag.
##   allow pipe (|) and (<>‹›) as divider for now - add more? why? why not?
def split_tags( str )
  str.split( /[|<>‹›]/ ).map( &:strip )
end
|
111
|
+
|
112
|
+
## Split the string into a geo tree (array of names) and
##   strip leading and trailing spaces from every name.
##  note: allow > < or › ‹ as divider for now
##   todo/check: rename to parse_geo(s) - why? why not?
def split_geo( str )
  str.split( /[<>‹›]/ ).map( &:strip )
end
|
118
|
+
|
119
|
+
end # class CountryReader
|
120
|
+
|
121
|
+
end # module Import
|
122
|
+
end # module SportDb
|