sportdb-config 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/config/leagues/fr.txt +9 -9
- data/config/leagues/gr.txt +7 -7
- data/config/leagues/sco.txt +19 -19
- data/config/world/eng.txt +162 -162
- data/lib/sportdb/config/club_reader.rb +278 -278
- data/lib/sportdb/config/clubs.rb +7 -0
- data/lib/sportdb/config/config.rb +123 -123
- data/lib/sportdb/config/league.rb +118 -118
- data/lib/sportdb/config/league_reader.rb +65 -65
- data/lib/sportdb/config/league_utils.rb +24 -24
- data/lib/sportdb/config/variants.rb +91 -81
- data/lib/sportdb/config/version.rb +1 -1
- data/test/test_club_reader.rb +150 -150
- data/test/test_league_reader.rb +54 -54
- data/test/test_league_utils.rb +46 -46
- data/test/test_season_utils.rb +29 -29
- data/test/test_variants.rb +14 -0
- metadata +12 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5cc9b8eea5116ab13e92c7a5a6b3b7962fa3f1c1
|
4
|
+
data.tar.gz: a94f4452d3398743f2e6283cead72ccc552d2ec0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6db0746eda38483b0bfcc241730044805c9332a0f2b27ad2aa5b96586371121579c8ef75a1903d482a7c1f68534a4231877a24fedaf87a866399e9f2105439f5
|
7
|
+
data.tar.gz: c5803e3626a36b8949d6b774e2b43748c27fa9c819a7ac9984c633f613f8b81ad1c9a026c0bafc3f1c66023d75b937675d6196ea01167b47f89ece4c14dde6fe
|
data/config/leagues/fr.txt
CHANGED
@@ -1,9 +1,9 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
1 => ligue1, French Ligue 1
|
4
|
-
2 => ligue2, French Ligue 2
|
5
|
-
|
6
|
-
|
7
|
-
[2001-02] ## until (including) 2001-02 season
|
8
|
-
1 => division1, ? ## use championat or something? check official name
|
9
|
-
2 => division2, ?
|
1
|
+
|
2
|
+
|
3
|
+
1 => ligue1, French Ligue 1
|
4
|
+
2 => ligue2, French Ligue 2
|
5
|
+
|
6
|
+
|
7
|
+
[2001-02] ## until (including) 2001-02 season
|
8
|
+
1 => division1, ? ## use championat or something? check official name
|
9
|
+
2 => division2, ?
|
data/config/leagues/gr.txt
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
1 => superleague, Greek Superleague
|
4
|
-
|
5
|
-
|
6
|
-
[2005-06] ## until (including) 2005-06 season
|
7
|
-
1 => alphaethniki, Greek Alpha Ethniki
|
1
|
+
|
2
|
+
|
3
|
+
1 => superleague, Greek Superleague
|
4
|
+
|
5
|
+
|
6
|
+
[2005-06] ## until (including) 2005-06 season
|
7
|
+
1 => alphaethniki, Greek Alpha Ethniki
|
data/config/leagues/sco.txt
CHANGED
@@ -1,19 +1,19 @@
|
|
1
|
-
|
2
|
-
1 => premiership, Scotish Premiership # starting w/ 2013-14 season
|
3
|
-
2 => championship, Scotish Championship
|
4
|
-
3 => league1, Scotish League One
|
5
|
-
4 => league2, Scotish League Two
|
6
|
-
|
7
|
-
|
8
|
-
[2012-13] ## until (including) 2012-13 season
|
9
|
-
1 => premierleague, Scotish Premiership
|
10
|
-
2 => division1, Scotish 1st Division
|
11
|
-
3 => division2, Scotish League One
|
12
|
-
4 => division3, Scotish League Two
|
13
|
-
|
14
|
-
|
15
|
-
[1997-98] ## until (including) season
|
16
|
-
1 => premierdivision, Scotish Premier Division
|
17
|
-
2 => division1, Scotish 1st Division
|
18
|
-
3 => division2, Scotish 2nd Division
|
19
|
-
4 => division3, Scotish 3rd Division
|
1
|
+
|
2
|
+
1 => premiership, Scotish Premiership # starting w/ 2013-14 season
|
3
|
+
2 => championship, Scotish Championship
|
4
|
+
3 => league1, Scotish League One
|
5
|
+
4 => league2, Scotish League Two
|
6
|
+
|
7
|
+
|
8
|
+
[2012-13] ## until (including) 2012-13 season
|
9
|
+
1 => premierleague, Scotish Premiership
|
10
|
+
2 => division1, Scotish 1st Division
|
11
|
+
3 => division2, Scotish League One
|
12
|
+
4 => division3, Scotish League Two
|
13
|
+
|
14
|
+
|
15
|
+
[1997-98] ## until (including) 1997-98 season
|
16
|
+
1 => premierdivision, Scotish Premier Division
|
17
|
+
2 => division1, Scotish 1st Division
|
18
|
+
3 => division2, Scotish 2nd Division
|
19
|
+
4 => division3, Scotish 3rd Division
|
data/config/world/eng.txt
CHANGED
@@ -1,162 +1,162 @@
|
|
1
|
-
##
|
2
|
-
# what name? use regions or maps or geos or zones or __?
|
3
|
-
|
4
|
-
#
|
5
|
-
# england
|
6
|
-
# see https://en.wikipedia.org/wiki/Subdivisions_of_England
|
7
|
-
# see https://www.bbc.co.uk/news/england/regions
|
8
|
-
|
9
|
-
##
|
10
|
-
# todo: for sort order - allow different sort name
|
11
|
-
# e.g. East Sussex => Sussex (East) or Sussex, East or something
|
12
|
-
# use <> for marking what counts for sorting
|
13
|
-
# e.g East <Sussex> => get auto-converted to Sussex, East - why? why not?
|
14
|
-
#
|
15
|
-
# more examples:
|
16
|
-
# North ‹Yorkshire› => Yorkshire (North) or Yorkshire, North
|
17
|
-
# Greater ‹London› => London, Greater
|
18
|
-
# Greater ‹Manchaster› => Manchaster, Greater
|
19
|
-
|
20
|
-
|
21
|
-
##
|
22
|
-
# note: uses all regions following the camra good beer guide (book)
|
23
|
-
#
|
24
|
-
|
25
|
-
======================================
|
26
|
-
== North West England ==
|
27
|
-
|
28
|
-
# Cumbria
|
29
|
-
# Lancashire
|
30
|
-
# Liverpool
|
31
|
-
# Manchester
|
32
|
-
|
33
|
-
Cheshire
|
34
|
-
Cumbria
|
35
|
-
Lancashire
|
36
|
-
Greater Manchester | Manchester, Greater
|
37
|
-
Liverpool & Merseyside | Merseyside ## note: added Liverpool & - why? why not?
|
38
|
-
|
39
|
-
|
40
|
-
===================================
|
41
|
-
== North East England
|
42
|
-
|
43
|
-
# Tees
|
44
|
-
# Tyne & Wear
|
45
|
-
|
46
|
-
Durham
|
47
|
-
Newcastle & Northumberland | Northumberland ## note: added Newcastle & - why? why not?
|
48
|
-
Tyne and Wear
|
49
|
-
|
50
|
-
|
51
|
-
=====================================
|
52
|
-
== Yorkshire & Lincolnshire ==
|
53
|
-
|
54
|
-
# Humberside
|
55
|
-
# Leeds & West Yorkshire
|
56
|
-
# Lincolnshire
|
57
|
-
# Sheffield & South Yorkshire
|
58
|
-
# York & North Yorkshire
|
59
|
-
|
60
|
-
Lincolnshire
|
61
|
-
West Yorkshire | Yorkshire, West | Yorkshire (West)
|
62
|
-
South Yorkshire | Yorkshire, South | Yorkshire (South)
|
63
|
-
North Yorkshire | Yorkshire, North | Yorkshire (North)
|
64
|
-
East Yorkshire | Yorkshire, East | Yorkshire (East)
|
65
|
-
|
66
|
-
|
67
|
-
==================================
|
68
|
-
== West Midlands
|
69
|
-
|
70
|
-
# Birmingham & Black Country
|
71
|
-
# Coventry & Warwickshire
|
72
|
-
# Hereford & Worcester
|
73
|
-
# Shropshire
|
74
|
-
# Stoke & Staffordshire
|
75
|
-
|
76
|
-
Herefordshire
|
77
|
-
Shropshire
|
78
|
-
Staffordshire
|
79
|
-
Warwickshire
|
80
|
-
Birmingham & West Midlands | West Midlands ## note: added Birmingham - why? why not?
|
81
|
-
Worcestershire
|
82
|
-
|
83
|
-
|
84
|
-
==================================
|
85
|
-
== East Midlands
|
86
|
-
|
87
|
-
# Derby
|
88
|
-
# Leicester
|
89
|
-
# Northampton
|
90
|
-
# Nottingham
|
91
|
-
|
92
|
-
Derbyshire
|
93
|
-
Leicestershire
|
94
|
-
Northamptonshire
|
95
|
-
Nottinghamshire
|
96
|
-
Rutland
|
97
|
-
|
98
|
-
|
99
|
-
===================================
|
100
|
-
== West & South West
|
101
|
-
|
102
|
-
# Bristol
|
103
|
-
# Cornwall
|
104
|
-
# Devon
|
105
|
-
# Gloucestershire
|
106
|
-
# Somerset
|
107
|
-
# Wiltshire
|
108
|
-
|
109
|
-
|
110
|
-
Cornwall
|
111
|
-
Devon
|
112
|
-
Bristol & Gloucestershire | Gloucestershire
|
113
|
-
Somerset
|
114
|
-
Wiltshire
|
115
|
-
|
116
|
-
|
117
|
-
=================================
|
118
|
-
== East
|
119
|
-
|
120
|
-
# Beds, Herts & Bucks
|
121
|
-
# Cambridgeshire
|
122
|
-
# Essex
|
123
|
-
# Norfolk
|
124
|
-
# Suffolk
|
125
|
-
|
126
|
-
Bedfordshire
|
127
|
-
Hertfordshire
|
128
|
-
Buckinghamshire
|
129
|
-
Cambridgeshire
|
130
|
-
Essex
|
131
|
-
Norfolk
|
132
|
-
Suffolk
|
133
|
-
|
134
|
-
|
135
|
-
===============================
|
136
|
-
== South
|
137
|
-
|
138
|
-
# Berkshire
|
139
|
-
# Dorset
|
140
|
-
# Hampshire & Isle of Wight
|
141
|
-
# Oxford
|
142
|
-
|
143
|
-
Berkshire
|
144
|
-
Dorset
|
145
|
-
Hampshire
|
146
|
-
Oxfordshire
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
===============================
|
151
|
-
== London & South East
|
152
|
-
|
153
|
-
# Kent
|
154
|
-
# London
|
155
|
-
# Surrey
|
156
|
-
# Sussex
|
157
|
-
|
158
|
-
Greater London | London, Greater
|
159
|
-
Kent
|
160
|
-
Surrey
|
161
|
-
East Sussex | Sussex, East | Sussex (East)
|
162
|
-
West Sussex | Sussex, West | Sussex (West)
|
1
|
+
##
|
2
|
+
# what name? use regions or maps or geos or zones or __?
|
3
|
+
|
4
|
+
#
|
5
|
+
# england
|
6
|
+
# see https://en.wikipedia.org/wiki/Subdivisions_of_England
|
7
|
+
# see https://www.bbc.co.uk/news/england/regions
|
8
|
+
|
9
|
+
##
|
10
|
+
# todo: for sort order - allow different sort name
|
11
|
+
# e.g. East Sussex => Sussex (East) or Sussex, East or something
|
12
|
+
# use <> for marking what counts for sorting
|
13
|
+
# e.g East <Sussex> => get auto-converted to Sussex, East - why? why not?
|
14
|
+
#
|
15
|
+
# more examples:
|
16
|
+
# North ‹Yorkshire› => Yorkshire (North) or Yorkshire, North
|
17
|
+
# Greater ‹London› => London, Greater
|
18
|
+
# Greater ‹Manchester› => Manchester, Greater
|
19
|
+
|
20
|
+
|
21
|
+
##
|
22
|
+
# note: uses all regions following the camra good beer guide (book)
|
23
|
+
#
|
24
|
+
|
25
|
+
======================================
|
26
|
+
== North West England ==
|
27
|
+
|
28
|
+
# Cumbria
|
29
|
+
# Lancashire
|
30
|
+
# Liverpool
|
31
|
+
# Manchester
|
32
|
+
|
33
|
+
Cheshire
|
34
|
+
Cumbria
|
35
|
+
Lancashire
|
36
|
+
Greater Manchester | Manchester, Greater
|
37
|
+
Liverpool & Merseyside | Merseyside ## note: added Liverpool & - why? why not?
|
38
|
+
|
39
|
+
|
40
|
+
===================================
|
41
|
+
== North East England
|
42
|
+
|
43
|
+
# Tees
|
44
|
+
# Tyne & Wear
|
45
|
+
|
46
|
+
Durham
|
47
|
+
Newcastle & Northumberland | Northumberland ## note: added Newcastle & - why? why not?
|
48
|
+
Tyne and Wear
|
49
|
+
|
50
|
+
|
51
|
+
=====================================
|
52
|
+
== Yorkshire & Lincolnshire ==
|
53
|
+
|
54
|
+
# Humberside
|
55
|
+
# Leeds & West Yorkshire
|
56
|
+
# Lincolnshire
|
57
|
+
# Sheffield & South Yorkshire
|
58
|
+
# York & North Yorkshire
|
59
|
+
|
60
|
+
Lincolnshire
|
61
|
+
West Yorkshire | Yorkshire, West | Yorkshire (West)
|
62
|
+
South Yorkshire | Yorkshire, South | Yorkshire (South)
|
63
|
+
North Yorkshire | Yorkshire, North | Yorkshire (North)
|
64
|
+
East Yorkshire | Yorkshire, East | Yorkshire (East)
|
65
|
+
|
66
|
+
|
67
|
+
==================================
|
68
|
+
== West Midlands
|
69
|
+
|
70
|
+
# Birmingham & Black Country
|
71
|
+
# Coventry & Warwickshire
|
72
|
+
# Hereford & Worcester
|
73
|
+
# Shropshire
|
74
|
+
# Stoke & Staffordshire
|
75
|
+
|
76
|
+
Herefordshire
|
77
|
+
Shropshire
|
78
|
+
Staffordshire
|
79
|
+
Warwickshire
|
80
|
+
Birmingham & West Midlands | West Midlands ## note: added Birmingham - why? why not?
|
81
|
+
Worcestershire
|
82
|
+
|
83
|
+
|
84
|
+
==================================
|
85
|
+
== East Midlands
|
86
|
+
|
87
|
+
# Derby
|
88
|
+
# Leicester
|
89
|
+
# Northampton
|
90
|
+
# Nottingham
|
91
|
+
|
92
|
+
Derbyshire
|
93
|
+
Leicestershire
|
94
|
+
Northamptonshire
|
95
|
+
Nottinghamshire
|
96
|
+
Rutland
|
97
|
+
|
98
|
+
|
99
|
+
===================================
|
100
|
+
== West & South West
|
101
|
+
|
102
|
+
# Bristol
|
103
|
+
# Cornwall
|
104
|
+
# Devon
|
105
|
+
# Gloucestershire
|
106
|
+
# Somerset
|
107
|
+
# Wiltshire
|
108
|
+
|
109
|
+
|
110
|
+
Cornwall
|
111
|
+
Devon
|
112
|
+
Bristol & Gloucestershire | Gloucestershire
|
113
|
+
Somerset
|
114
|
+
Wiltshire
|
115
|
+
|
116
|
+
|
117
|
+
=================================
|
118
|
+
== East
|
119
|
+
|
120
|
+
# Beds, Herts & Bucks
|
121
|
+
# Cambridgeshire
|
122
|
+
# Essex
|
123
|
+
# Norfolk
|
124
|
+
# Suffolk
|
125
|
+
|
126
|
+
Bedfordshire
|
127
|
+
Hertfordshire
|
128
|
+
Buckinghamshire
|
129
|
+
Cambridgeshire
|
130
|
+
Essex
|
131
|
+
Norfolk
|
132
|
+
Suffolk
|
133
|
+
|
134
|
+
|
135
|
+
===============================
|
136
|
+
== South
|
137
|
+
|
138
|
+
# Berkshire
|
139
|
+
# Dorset
|
140
|
+
# Hampshire & Isle of Wight
|
141
|
+
# Oxford
|
142
|
+
|
143
|
+
Berkshire
|
144
|
+
Dorset
|
145
|
+
Hampshire
|
146
|
+
Oxfordshire
|
147
|
+
|
148
|
+
|
149
|
+
|
150
|
+
===============================
|
151
|
+
== London & South East
|
152
|
+
|
153
|
+
# Kent
|
154
|
+
# London
|
155
|
+
# Surrey
|
156
|
+
# Sussex
|
157
|
+
|
158
|
+
Greater London | London, Greater
|
159
|
+
Kent
|
160
|
+
Surrey
|
161
|
+
East Sussex | Sussex, East | Sussex (East)
|
162
|
+
West Sussex | Sussex, West | Sussex (West)
|
@@ -1,278 +1,278 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
|
4
|
-
module SportDb
|
5
|
-
module Import
|
6
|
-
|
7
|
-
|
8
|
-
class ClubReader
|
9
|
-
|
10
|
-
|
11
|
-
def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
|
12
|
-
txt = File.open( path, 'r:utf-8' ).read
|
13
|
-
parse( txt )
|
14
|
-
end
|
15
|
-
|
16
|
-
|
17
|
-
def self.parse( txt )
|
18
|
-
recs = []
|
19
|
-
last_rec = nil
|
20
|
-
headings = [] ## headings stack
|
21
|
-
|
22
|
-
txt.each_line do |line|
|
23
|
-
line = line.strip
|
24
|
-
|
25
|
-
next if line.empty?
|
26
|
-
next if line.start_with?( '#' ) ## skip comments too
|
27
|
-
|
28
|
-
## strip inline (until end-of-line) comments too
|
29
|
-
## e.g Eupen => KAS Eupen, ## [de]
|
30
|
-
## => Eupen => KAS Eupen,
|
31
|
-
line = line.sub( /#.*/, '' ).strip
|
32
|
-
pp line
|
33
|
-
|
34
|
-
|
35
|
-
next if line =~ /^={1,}$/ ## skip "decorative" only heading e.g. ========
|
36
|
-
|
37
|
-
## note: like in wikimedia markup (and markdown) all optional trailing ==== too
|
38
|
-
## todo/check: allow === Text =-=-=-=-=-= too - why? why not?
|
39
|
-
if line =~ /^(={1,}) ## leading ======
|
40
|
-
([^=]+?) ## text (note: for now no "inline" = allowed)
|
41
|
-
=* ## (optional) trailing ====
|
42
|
-
$/x
|
43
|
-
heading_marker = $1
|
44
|
-
heading_level = $1.length ## count number of = for heading level
|
45
|
-
heading = $2.strip
|
46
|
-
|
47
|
-
puts "heading #{heading_level} >#{heading}<"
|
48
|
-
|
49
|
-
## 1) first pop headings if present
|
50
|
-
while headings.size+1 > heading_level
|
51
|
-
headings.pop
|
52
|
-
end
|
53
|
-
|
54
|
-
## 2) add missing (hierarchy) level if
|
55
|
-
while headings.size+1 < heading_level
|
56
|
-
## todo/fix: issue warning about "skipping" hierarchy level
|
57
|
-
puts "!!! warn [team reader] - skipping hierarchy level in headings "
|
58
|
-
headings.push( nil )
|
59
|
-
end
|
60
|
-
|
61
|
-
if heading =~ /^\?+$/ ## note: use ? or ?? or ?? to reset level to nil
|
62
|
-
## keep level empty
|
63
|
-
else
|
64
|
-
|
65
|
-
## quick hack: if level is 1 assume country for now
|
66
|
-
## and extract country code e.g.
|
67
|
-
## Austria (at) => at
|
68
|
-
## todo/fix: allow code only e.g. at or aut without enclosing () too - why? why not?
|
69
|
-
if heading_level == 1
|
70
|
-
if heading =~ /\(([a-z]{2,3})\)/i ## note allow (at) or (AUT) too
|
71
|
-
country_code = $1
|
72
|
-
|
73
|
-
## check country code - MUST exist for now!!!!
|
74
|
-
country = SportDb::Import.config.countries[ country_code ]
|
75
|
-
if country.nil?
|
76
|
-
puts "!!! error [team reader] - unknown country with code >#{country_code}< - sorry - add country to config to fix"
|
77
|
-
exit 1
|
78
|
-
end
|
79
|
-
|
80
|
-
headings.push( country_code )
|
81
|
-
else
|
82
|
-
puts "!!! error - heading level 1 - missing country code - >#{heading}<"
|
83
|
-
exit 1
|
84
|
-
end
|
85
|
-
else
|
86
|
-
## quick hack:
|
87
|
-
## remove known fill/dummy words incl:
|
88
|
-
## Provincia San Juan => San Juan (see argentina, for example)
|
89
|
-
##
|
90
|
-
## use geo tree long term with alternative names - why? why not?
|
91
|
-
words = ['Provincia']
|
92
|
-
words.each { |word| heading = heading.gsub( word, '' ) }
|
93
|
-
heading = heading.strip
|
94
|
-
|
95
|
-
headings.push( heading )
|
96
|
-
end
|
97
|
-
|
98
|
-
## assert that hierarchy level is ok
|
99
|
-
if headings.size != heading_level
|
100
|
-
puts "!!! error - headings hierarchy/stack out of order - #{heading.size}<=>#{heading_level}"
|
101
|
-
exit 1
|
102
|
-
end
|
103
|
-
end
|
104
|
-
|
105
|
-
pp headings
|
106
|
-
|
107
|
-
elsif line.start_with?( '|' )
|
108
|
-
## assume continuation with line of alternative names
|
109
|
-
## note: skip leading pipe
|
110
|
-
values = line[1..-1].split( '|' ) # team names - allow/use pipe(|)
|
111
|
-
## strip and squish (white)spaces
|
112
|
-
# e.g. New York FC (2011-) => New York FC (2011-)
|
113
|
-
values = values.map { |value| value.strip.gsub( /[ \t]+/, ' ' ) }
|
114
|
-
last_rec.alt_names += values
|
115
|
-
last_rec.add_variants( values ) # auto-add (possible) auto-generated variant names
|
116
|
-
|
117
|
-
## check for duplicates
|
118
|
-
if last_rec.duplicates?
|
119
|
-
duplicates = last_rec.duplicates
|
120
|
-
puts "*** !!! WARN !!! - #{duplicates.size} duplicate alt name mapping(s):"
|
121
|
-
pp duplicates
|
122
|
-
pp last_rec
|
123
|
-
##
|
124
|
-
## todo/fix: make it only an error with exit 1
|
125
|
-
## if (not normalized) names are the same (not unique/uniq)
|
126
|
-
## e.g. don't exit on A.F.C. == AFC etc.
|
127
|
-
## exit 1
|
128
|
-
end
|
129
|
-
else
|
130
|
-
values = line.split( ',' )
|
131
|
-
|
132
|
-
rec = Club.new
|
133
|
-
value = values.shift ## get first item
|
134
|
-
## strip and squish (white)spaces
|
135
|
-
# e.g. New York FC (2011-) => New York FC (2011-)
|
136
|
-
value = value.strip.gsub( /[ \t]+/, ' ' )
|
137
|
-
rec.name = value # canoncial name (global unique "beautiful/long" name)
|
138
|
-
rec.add_variants( value ) # auto-add (possible) auto-generated variant names
|
139
|
-
|
140
|
-
## note:
|
141
|
-
## check/todo!!!!!!!!!!!!!!!!!-
|
142
|
-
## strip year if to present e.g. (2011-)
|
143
|
-
##
|
144
|
-
## do NOT strip for defunct / historic clubs e.g.
|
145
|
-
## (1899-1910)
|
146
|
-
## or (-1914) or (-2011) etc.
|
147
|
-
|
148
|
-
###
|
149
|
-
## todo: move year out of canonical team name - why? why not?
|
150
|
-
|
151
|
-
## check if canonical name include (2011-) or similar in name
|
152
|
-
## if yes, remove (2011-) and add to (alt) names
|
153
|
-
## e.g. New York FC (2011) => New York FC
|
154
|
-
if rec.name =~ /\(.+?\)/ ## note: use non-greedy (?) match
|
155
|
-
name = rec.name.gsub( /\(.+?\)/, '' ).strip
|
156
|
-
|
157
|
-
if rec.name =~ /\(([0-9]{4})-\)/ ## e.g. (2014-)
|
158
|
-
rec.year = $1.to_i
|
159
|
-
elsif rec.name =~ /\(-([0-9]{4})\)/ ## e.g. (-2014)
|
160
|
-
rec.year_end = $1.to_i
|
161
|
-
elsif rec.name =~ /\(([0-9]{4})-([0-9]{4})\)/ ## e.g. (2011-2014)
|
162
|
-
rec.year = $1.to_i
|
163
|
-
rec.year_end = $2.to_i
|
164
|
-
else
|
165
|
-
## todo/check: warn about unknown year format
|
166
|
-
end
|
167
|
-
end
|
168
|
-
|
169
|
-
## todo/check - check for unknown format values
|
170
|
-
## e.g. too many values, duplicate years, etc.
|
171
|
-
## check for overwritting, etc.
|
172
|
-
while values.size > 0
|
173
|
-
value = values.shift
|
174
|
-
## strip and squish (white)spaces
|
175
|
-
# e.g. León › Guanajuato => León › Guanajuato
|
176
|
-
value = value.strip.gsub( /[ \t]+/, ' ' )
|
177
|
-
if value =~/^\d{4}$/ # e.g 1904
|
178
|
-
## todo/check: issue warning if year is already set!!!!!!!
|
179
|
-
if rec.year
|
180
|
-
puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{value}:"
|
181
|
-
pp rec
|
182
|
-
exit 1
|
183
|
-
end
|
184
|
-
rec.year = value.to_i
|
185
|
-
elsif value.start_with?( '@' ) # e.g. @ Anfield
|
186
|
-
## cut-off leading @ and spaces
|
187
|
-
rec.ground = value[1..-1].strip
|
188
|
-
else
|
189
|
-
## assume city / geo tree
|
190
|
-
## split into geo tree
|
191
|
-
geos = split_geo( value )
|
192
|
-
city = geos[0]
|
193
|
-
## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
|
194
|
-
if city =~ /\((.+?)\)/ ## note: use non-greedy (?) match
|
195
|
-
rec.district = $1.strip
|
196
|
-
city = city.gsub( /\(.+?\)/, '' ).strip
|
197
|
-
end
|
198
|
-
rec.city = city
|
199
|
-
|
200
|
-
if geos.size > 1
|
201
|
-
## cut-off city and keep the rest (of geo tree)
|
202
|
-
rec.geos = geos[1..-1]
|
203
|
-
end
|
204
|
-
end
|
205
|
-
end ## while values
|
206
|
-
|
207
|
-
|
208
|
-
###############
|
209
|
-
## use headings text for geo tree
|
210
|
-
|
211
|
-
## 1) add country if present
|
212
|
-
if headings.size > 0 && headings[0]
|
213
|
-
country = SportDb::Import.config.countries[ headings[0] ]
|
214
|
-
rec.country = country
|
215
|
-
else
|
216
|
-
## make it an error - why? why not?
|
217
|
-
puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
|
218
|
-
exit 1
|
219
|
-
end
|
220
|
-
|
221
|
-
## 2) check geo tree with headings hierarchy
|
222
|
-
if headings.size > 1 && headings[1]
|
223
|
-
geos = split_geo( headings[1] )
|
224
|
-
if rec.geos
|
225
|
-
if rec.geos[0] != geos[0]
|
226
|
-
puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
|
227
|
-
exit 1
|
228
|
-
end
|
229
|
-
if rec.geos[1] && rec.geos[1] != geos[1] ## check optional 2nd level too
|
230
|
-
puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
|
231
|
-
exit 1
|
232
|
-
end
|
233
|
-
else
|
234
|
-
## add missing region (state/province) from headings hierarchy
|
235
|
-
rec.geos = geos
|
236
|
-
end
|
237
|
-
end
|
238
|
-
|
239
|
-
last_rec = rec
|
240
|
-
|
241
|
-
|
242
|
-
### todo/fix:
|
243
|
-
## auto-add alt name with dots stripped - why? why not?
|
244
|
-
## e.g. D.C. United => DC United
|
245
|
-
## e.g. Liverpool F.C. => Liverpool FC
|
246
|
-
## e.g. St. Albin => St Albin etc.
|
247
|
-
## e.g. 1. FC Köln => 1 FC Köln -- make special case for 1. - why? why not?
|
248
|
-
|
249
|
-
##
|
250
|
-
## todo/fix: unify mapping entries
|
251
|
-
## always lowercase !!!! (case insensitive)
|
252
|
-
## always strip (2011-) !!!
|
253
|
-
## always strip dots (e.g. St., F.C, etc.)
|
254
|
-
|
255
|
-
recs << rec
|
256
|
-
end
|
257
|
-
end # each_line
|
258
|
-
recs
|
259
|
-
end # method read
|
260
|
-
|
261
|
-
### helpers
|
262
|
-
def self.split_geo( str )
|
263
|
-
## assume city / geo tree
|
264
|
-
## strip and squish (white)spaces
|
265
|
-
# e.g. León › Guanajuato => León › Guanajuato
|
266
|
-
str = str.strip.gsub( /[ \t]+/, ' ' )
|
267
|
-
|
268
|
-
## split into geo tree
|
269
|
-
geos = str.split( /[<>‹›]/ ) ## note: allow > < or › ‹
|
270
|
-
geos = geos.map { |geo| geo.strip } ## remove all whitespaces
|
271
|
-
geos
|
272
|
-
end
|
273
|
-
|
274
|
-
end # class ClubReader
|
275
|
-
|
276
|
-
|
277
|
-
end ## module Import
|
278
|
-
end ## module SportDb
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
module SportDb
|
5
|
+
module Import
|
6
|
+
|
7
|
+
|
8
|
+
class ClubReader
|
9
|
+
|
10
|
+
|
11
|
+
## Read a clubs datafile from disk and parse it into club records.
##
## path - String with the location of the datafile; opened read-only as utf-8
##
## Returns whatever parse returns for the file's content.
##
## todo/check: rename to read_file or from_file etc. - why? why not?
def self.read( path )
  ## note: use the block form so the file handle gets closed right away
  ##   (and not some time later by the garbage collector)
  txt = File.open( path, 'r:utf-8' ) { |file| file.read }
  parse( txt )
end
|
15
|
+
|
16
|
+
|
17
|
+
## Parse the text of a clubs datafile into club records.
##
## Every line gets stripped; empty lines and lines starting with # get skipped
## and inline (until end-of-line) comments get cut off. What is left is one of:
##
##   ========                     decorative rule               => skipped
##   = Austria (at) =             level 1 heading               => country; the code in () MUST be known
##   == Wien ==                   level 2+ heading              => pushed onto the headings stack
##   == ? ==                      heading made of ? only        => nothing gets pushed for this level
##   | Alt Name | Alt Name        continuation line             => more alt names for the club before
##   Name, 1911, City, @ Ground   everything else               => starts a new club record
##
## txt - String with the content of the datafile
##
## Returns an Array of Club records in the order found.
##
## note: prints debug / progress output to stdout and calls exit 1
##   on input errors (unknown country code, missing country heading,
##   year set twice, geo tree and headings mismatch, and so on).
def self.parse( txt )
  recs     = []
  last_rec = nil
  headings = []   ## headings stack

  txt.each_line do |line|
    line = line.strip

    next if line.empty?
    next if line.start_with?( '#' )   ## skip comments too

    ## strip inline (until end-of-line) comments too
    ##   e.g. Eupen => KAS Eupen,  ## [de]
    ##     => Eupen => KAS Eupen,
    line = line.sub( /#.*/, '' ).strip
    pp line    ## debug output (kept as is)

    next if line =~ /^={1,}$/     ## skip "decorative" only heading e.g. ========

    ## note: like in wikimedia markup (and markdown) allow optional trailing ==== too
    ## todo/check: allow === Text =-=-=-=-=-= too - why? why not?
    if line =~ /^(={1,})    ## leading markers
                ([^=]+?)    ## text - note: for now no inline marker allowed
                =*          ## optional trailing markers
                $/x
      heading_level = $1.length     ## count number of = for heading level
      heading       = $2.strip

      puts "heading #{heading_level} >#{heading}<"

      ## 1) first pop headings until the stack is one level short
      while headings.size+1 > heading_level
        headings.pop
      end

      ## 2) add missing (hierarchy) levels if a level got skipped
      while headings.size+1 < heading_level
        puts "!!! warn [team reader] - skipping hierarchy level in headings "
        headings.push( nil )
      end

      ## note: a heading of ? or ?? or ??? pushes nothing, that is, keeps the level empty
      unless heading =~ /^\?+$/
        ## quick hack: if level is 1 assume country for now
        ##   and extract country code e.g.
        ##     Austria (at)  => at
        ## todo/fix: allow code only e.g. at or aut without enclosing () too - why? why not?
        if heading_level == 1
          if heading =~ /\(([a-z]{2,3})\)/i     ## note: allow (at) or (AUT) too
            country_code = $1

            ## check country code - MUST exist for now!!!!
            country = SportDb::Import.config.countries[ country_code ]
            if country.nil?
              puts "!!! error [team reader] - unknown country with code >#{country_code}< - sorry - add country to config to fix"
              exit 1
            end

            headings.push( country_code )
          else
            puts "!!! error - heading level 1 - missing country code - >#{heading}<"
            exit 1
          end
        else
          ## quick hack:
          ##   remove known fill/dummy words incl:
          ##     Provincia San Juan => San Juan   (see argentina, for example)
          ##
          ##  use geo tree long term with alternative names - why? why not?
          words = ['Provincia']
          words.each { |word| heading = heading.gsub( word, '' ) }
          heading = heading.strip

          headings.push( heading )
        end

        ## assert that hierarchy level is ok
        ##   note: report the size of the headings stack (not the length of the heading text)
        if headings.size != heading_level
          puts "!!! error - headings hierarchy/stack out of order - #{headings.size}<=>#{heading_level}"
          exit 1
        end
      end

      pp headings    ## debug output (kept as is)

    elsif line.start_with?( '|' )
      ## assume continuation with line of alternative names

      ## note: a continuation line needs a club record to attach to;
      ##   report an input error (instead of crashing on nil)
      if last_rec.nil?
        puts "!!! error [team reader] - alternative names line >#{line}< without preceding club record"
        exit 1
      end

      ## note: skip leading pipe
      values = line[1..-1].split( '|' )   # team names - allow/use pipe(|)
      ## strip and squish (white)spaces
      values = values.map { |value| value.strip.gsub( /[ \t]+/, ' ' ) }
      last_rec.alt_names += values
      last_rec.add_variants( values )   # auto-add (possible) auto-generated variant names

      ## check for duplicates
      if last_rec.duplicates?
        duplicates = last_rec.duplicates
        puts "*** !!! WARN !!! - #{duplicates.size} duplicate alt name mapping(s):"
        pp duplicates
        pp last_rec
        ##
        ## todo/fix: make it only an error with exit 1
        ##   if (not normalized) names are the same (not unique/uniq)
        ##   e.g. don't exit on A.F.C. == AFC etc.
        ## exit 1
      end
    else
      values = line.split( ',' )

      rec   = Club.new
      value = values.shift    ## get first item
      ## strip and squish (white)spaces
      value = value.strip.gsub( /[ \t]+/, ' ' )
      rec.name = value            # canonical name (global unique "beautiful/long" name)
      rec.add_variants( value )   # auto-add (possible) auto-generated variant names

      ## pick up the year(s) if the canonical name includes (2011-) or similar
      ##   note: the canonical name itself is kept as is (incl. the year part)
      ## todo: move year out of canonical team name - why? why not?
      if rec.name =~ /\(([0-9]{4})-\)/               ## e.g. (2014-)
        rec.year     = $1.to_i
      elsif rec.name =~ /\(-([0-9]{4})\)/            ## e.g. (-2014)
        rec.year_end = $1.to_i
      elsif rec.name =~ /\(([0-9]{4})-([0-9]{4})\)/  ## e.g. (2011-2014)
        rec.year     = $1.to_i
        rec.year_end = $2.to_i
      end
      ## todo/check: warn about unknown year format

      ## todo/check - check for unknown format values
      ##   e.g. too many values, duplicate years, etc.
      ##   check for overwriting, etc.
      while values.size > 0
        value = values.shift
        ## strip and squish (white)spaces
        value = value.strip.gsub( /[ \t]+/, ' ' )
        if value =~/^\d{4}$/     # e.g. 1904
          if rec.year
            puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{value}:"
            pp rec
            exit 1
          end
          rec.year = value.to_i
        elsif value.start_with?( '@' )   # e.g. @ Anfield
          ## cut-off leading @ and spaces
          rec.ground = value[1..-1].strip
        else
          ## assume city / geo tree - split into geo tree
          geos = split_geo( value )
          city = geos[0]
          ## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
          if city =~ /\((.+?)\)/    ## note: use non-greedy (?) match
            rec.district = $1.strip
            city = city.gsub( /\(.+?\)/, '' ).strip
          end
          rec.city = city

          if geos.size > 1
            ## cut-off city and keep the rest (of geo tree)
            rec.geos = geos[1..-1]
          end
        end
      end   ## while values

      ###############
      ## use headings text for geo tree

      ## 1) add country if present
      if headings.size > 0 && headings[0]
        country = SportDb::Import.config.countries[ headings[0] ]
        rec.country = country
      else
        ## make it an error - why? why not?
        puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
        exit 1
      end

      ## 2) check geo tree with headings hierarchy
      if headings.size > 1 && headings[1]
        geos = split_geo( headings[1] )
        if rec.geos
          if rec.geos[0] != geos[0]
            puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
            exit 1
          end
          if rec.geos[1] && rec.geos[1] != geos[1]  ## check optional 2nd level too
            puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
            exit 1
          end
        else
          ## add missing region (state/province) from headings hierarchy
          rec.geos = geos
        end
      end

      last_rec = rec

      ### todo/fix:
      ##  auto-add alt name with dots stripped - why? why not?
      ##    e.g. D.C. United => DC United
      ##    e.g. Liverpool F.C. => Liverpool FC
      ##    e.g. St. Albin => St Albin etc.
      ##    e.g. 1. FC Köln => 1 FC Köln  -- make special case for 1. - why? why not?

      ##
      ## todo/fix: unify mapping entries
      ##   always lowercase !!!! (case insensitive)
      ##   always strip (2011-) !!!
      ##   always strip dots (e.g. St., F.C, etc.)

      recs << rec
    end
  end  # each_line
  recs
end  # method parse
|
260
|
+
|
261
|
+
### helpers
|
262
|
+
## Split a city / geo tree string into its levels.
##
##   e.g. León › Guanajuato   => ["León", "Guanajuato"]
##
## str - String with levels separated by > or < or › or ‹ (nil counts as empty)
##
## Returns an Array of Strings; every level has leading and trailing
## whitespace removed and runs of spaces / tabs squished into a single space.
##
## note: empty levels get dropped no matter where they show up
##   (String#split on its own only drops the trailing ones)
##     e.g. A › › B    => ["A", "B"]
def self.split_geo( str )
  str.to_s
     .split( /[<>‹›]/ )    ## note: allow > < or › ‹
     .map    { |geo| geo.strip.gsub( /[ \t]+/, ' ' ) }
     .reject { |geo| geo.empty? }
end
|
273
|
+
|
274
|
+
end # class ClubReader
|
275
|
+
|
276
|
+
|
277
|
+
end ## module Import
|
278
|
+
end ## module SportDb
|