sportdb-config 0.4.0 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 064b7f927038794c7d1efecde27bbba3d7ac78fa
4
- data.tar.gz: 0dc83a80b70db77bca25e9921182fc4a1aea0466
3
+ metadata.gz: 5cc9b8eea5116ab13e92c7a5a6b3b7962fa3f1c1
4
+ data.tar.gz: a94f4452d3398743f2e6283cead72ccc552d2ec0
5
5
  SHA512:
6
- metadata.gz: 75d1416b8976f6bd619de196265a7ba1ac3f88c020b28489a4302732272f0e1b686f76a2dcf877e40364e240d148bf0853ba643466ff86d454060e45c9dd4da3
7
- data.tar.gz: 1fdc13cde4020f2bc2a21da04b7df6ce25810f4f282a477862adf2af789b17d6fa009e08163fc163579478aed0ca9210588aafb09f728d2472a9711bdec90c1a
6
+ metadata.gz: 6db0746eda38483b0bfcc241730044805c9332a0f2b27ad2aa5b96586371121579c8ef75a1903d482a7c1f68534a4231877a24fedaf87a866399e9f2105439f5
7
+ data.tar.gz: c5803e3626a36b8949d6b774e2b43748c27fa9c819a7ac9984c633f613f8b81ad1c9a026c0bafc3f1c66023d75b937675d6196ea01167b47f89ece4c14dde6fe
@@ -1,9 +1,9 @@
1
-
2
-
3
- 1 => ligue1, French Ligue 1
4
- 2 => ligue2, French Ligue 2
5
-
6
-
7
- [2001-02] ## until (including) 2001-02 season
8
- 1 => division1, ? ## use championat or something? check official name
9
- 2 => division2, ?
1
+
2
+
3
+ 1 => ligue1, French Ligue 1
4
+ 2 => ligue2, French Ligue 2
5
+
6
+
7
+ [2001-02] ## until (including) 2001-02 season
8
+ 1 => division1, ? ## use championat or something? check official name
9
+ 2 => division2, ?
@@ -1,7 +1,7 @@
1
-
2
-
3
- 1 => superleague, Greek Superleague
4
-
5
-
6
- [2005-06] ## until (including) 2005-06 season
7
- 1 => alphaethniki, Greek Alpha Ethniki
1
+
2
+
3
+ 1 => superleague, Greek Superleague
4
+
5
+
6
+ [2005-06] ## until (including) 2005-06 season
7
+ 1 => alphaethniki, Greek Alpha Ethniki
@@ -1,19 +1,19 @@
1
-
2
- 1 => premiership, Scotish Premiership # starting w/ 2013-14 season
3
- 2 => championship, Scotish Championship
4
- 3 => league1, Scotish League One
5
- 4 => league2, Scotish League Two
6
-
7
-
8
- [2012-13] ## until (including) 2012-13 season
9
- 1 => premierleague, Scotish Premiership
10
- 2 => division1, Scotish 1st Division
11
- 3 => division2, Scotish League One
12
- 4 => division3, Scotish League Two
13
-
14
-
15
- [1997-98] ## until (including) season
16
- 1 => premierdivision, Scotish Premier Division
17
- 2 => division1, Scotish 1st Division
18
- 3 => division2, Scotish 2nd Division
19
- 4 => division3, Scotish 3rd Division
1
+
2
+ 1 => premiership, Scotish Premiership # starting w/ 2013-14 season
3
+ 2 => championship, Scotish Championship
4
+ 3 => league1, Scotish League One
5
+ 4 => league2, Scotish League Two
6
+
7
+
8
+ [2012-13] ## until (including) 2012-13 season
9
+ 1 => premierleague, Scotish Premiership
10
+ 2 => division1, Scotish 1st Division
11
+ 3 => division2, Scotish League One
12
+ 4 => division3, Scotish League Two
13
+
14
+
15
+ [1997-98] ## until (including) season
16
+ 1 => premierdivision, Scotish Premier Division
17
+ 2 => division1, Scotish 1st Division
18
+ 3 => division2, Scotish 2nd Division
19
+ 4 => division3, Scotish 3rd Division
@@ -1,162 +1,162 @@
1
- ##
2
- # what name? use regions or maps or geos or zones or __?
3
-
4
- #
5
- # england
6
- # see https://en.wikipedia.org/wiki/Subdivisions_of_England
7
- # see https://www.bbc.co.uk/news/england/regions
8
-
9
- ##
10
- # todo: for sort order - allow different sort name
11
- # e.g. East Sussex => Sussex (East) or Sussex, East or something
12
- # use <> for marking what counts for sorting
13
- # e.g East <Sussex> => get auto-converted to Sussex, East - why? why not?
14
- #
15
- # more examples:
16
- # North ‹Yorkshire› => Yorkshire (North) or Yorkshire, North
17
- # Greater ‹London› => London, Greater
18
- # Greater ‹Manchaster› => Manchaster, Greater
19
-
20
-
21
- ##
22
- # note: uses all regions following the camra good beer guide (book)
23
- #
24
-
25
- ======================================
26
- == North West England ==
27
-
28
- # Cumbria
29
- # Lancashire
30
- # Liverpool
31
- # Manchester
32
-
33
- Cheshire
34
- Cumbria
35
- Lancashire
36
- Greater Manchester | Manchester, Greater
37
- Liverpool & Merseyside | Merseyside ## note: added Liverpool & - why? why not?
38
-
39
-
40
- ===================================
41
- == North East England
42
-
43
- # Tees
44
- # Tyne & Wear
45
-
46
- Durham
47
- Newcastle & Northumberland | Northumberland ## note: added Newcastle & - why? why not?
48
- Tyne and Wear
49
-
50
-
51
- =====================================
52
- == Yorkshire & Lincolnshire ==
53
-
54
- # Humberside
55
- # Leeds & West Yorkshire
56
- # Lincolnshire
57
- # Sheffield & South Yorkshire
58
- # York & North Yorkshire
59
-
60
- Lincolnshire
61
- West Yorkshire | Yorkshire, West | Yorkshire (West)
62
- South Yorkshire | Yorkshire, South | Yorkshire (South)
63
- North Yorkshire | Yorkshire, North | Yorkshire (North)
64
- East Yorkshire | Yorkshire, East | Yorkshire (East)
65
-
66
-
67
- ==================================
68
- == West Midlands
69
-
70
- # Birmingham & Black Country
71
- # Coventry & Warwickshire
72
- # Hereford & Worcester
73
- # Shropshire
74
- # Stoke & Staffordshire
75
-
76
- Herefordshire
77
- Shropshire
78
- Staffordshire
79
- Warwickshire
80
- Birmingham & West Midlands | West Midlands ## note: added Birmingham - why? why not?
81
- Worcestershire
82
-
83
-
84
- ==================================
85
- == East Midlands
86
-
87
- # Derby
88
- # Leicester
89
- # Northampton
90
- # Nottingham
91
-
92
- Derbyshire
93
- Leicestershire
94
- Northamptonshire
95
- Nottinghamshire
96
- Rutland
97
-
98
-
99
- ===================================
100
- == West & South West
101
-
102
- # Bristol
103
- # Cornwall
104
- # Devon
105
- # Gloucestershire
106
- # Somerset
107
- # Wiltshire
108
-
109
-
110
- Cornwall
111
- Devon
112
- Bristol & Gloucestershire | Gloucestershire
113
- Somerset
114
- Wiltshire
115
-
116
-
117
- =================================
118
- == East
119
-
120
- # Beds, Herts & Bucks
121
- # Cambridgeshire
122
- # Essex
123
- # Norfolk
124
- # Suffolk
125
-
126
- Bedfordshire
127
- Hertfordshire
128
- Buckinghamshire
129
- Cambridgeshire
130
- Essex
131
- Norfolk
132
- Suffolk
133
-
134
-
135
- ===============================
136
- == South
137
-
138
- # Berkshire
139
- # Dorset
140
- # Hampshire & Isle of Wight
141
- # Oxford
142
-
143
- Berkshire
144
- Dorset
145
- Hampshire
146
- Oxfordshire
147
-
148
-
149
-
150
- ===============================
151
- == London & South East
152
-
153
- # Kent
154
- # London
155
- # Surrey
156
- # Sussex
157
-
158
- Greater London | London, Greater
159
- Kent
160
- Surrey
161
- East Sussex | Sussex, East | Sussex (East)
162
- West Sussex | Sussex, West | Sussex (West)
1
+ ##
2
+ # what name? use regions or maps or geos or zones or __?
3
+
4
+ #
5
+ # england
6
+ # see https://en.wikipedia.org/wiki/Subdivisions_of_England
7
+ # see https://www.bbc.co.uk/news/england/regions
8
+
9
+ ##
10
+ # todo: for sort order - allow different sort name
11
+ # e.g. East Sussex => Sussex (East) or Sussex, East or something
12
+ # use <> for marking what counts for sorting
13
+ # e.g East <Sussex> => get auto-converted to Sussex, East - why? why not?
14
+ #
15
+ # more examples:
16
+ # North ‹Yorkshire› => Yorkshire (North) or Yorkshire, North
17
+ # Greater ‹London› => London, Greater
18
+ # Greater ‹Manchaster› => Manchaster, Greater
19
+
20
+
21
+ ##
22
+ # note: uses all regions following the camra good beer guide (book)
23
+ #
24
+
25
+ ======================================
26
+ == North West England ==
27
+
28
+ # Cumbria
29
+ # Lancashire
30
+ # Liverpool
31
+ # Manchester
32
+
33
+ Cheshire
34
+ Cumbria
35
+ Lancashire
36
+ Greater Manchester | Manchester, Greater
37
+ Liverpool & Merseyside | Merseyside ## note: added Liverpool & - why? why not?
38
+
39
+
40
+ ===================================
41
+ == North East England
42
+
43
+ # Tees
44
+ # Tyne & Wear
45
+
46
+ Durham
47
+ Newcastle & Northumberland | Northumberland ## note: added Newcastle & - why? why not?
48
+ Tyne and Wear
49
+
50
+
51
+ =====================================
52
+ == Yorkshire & Lincolnshire ==
53
+
54
+ # Humberside
55
+ # Leeds & West Yorkshire
56
+ # Lincolnshire
57
+ # Sheffield & South Yorkshire
58
+ # York & North Yorkshire
59
+
60
+ Lincolnshire
61
+ West Yorkshire | Yorkshire, West | Yorkshire (West)
62
+ South Yorkshire | Yorkshire, South | Yorkshire (South)
63
+ North Yorkshire | Yorkshire, North | Yorkshire (North)
64
+ East Yorkshire | Yorkshire, East | Yorkshire (East)
65
+
66
+
67
+ ==================================
68
+ == West Midlands
69
+
70
+ # Birmingham & Black Country
71
+ # Coventry & Warwickshire
72
+ # Hereford & Worcester
73
+ # Shropshire
74
+ # Stoke & Staffordshire
75
+
76
+ Herefordshire
77
+ Shropshire
78
+ Staffordshire
79
+ Warwickshire
80
+ Birmingham & West Midlands | West Midlands ## note: added Birmingham - why? why not?
81
+ Worcestershire
82
+
83
+
84
+ ==================================
85
+ == East Midlands
86
+
87
+ # Derby
88
+ # Leicester
89
+ # Northampton
90
+ # Nottingham
91
+
92
+ Derbyshire
93
+ Leicestershire
94
+ Northamptonshire
95
+ Nottinghamshire
96
+ Rutland
97
+
98
+
99
+ ===================================
100
+ == West & South West
101
+
102
+ # Bristol
103
+ # Cornwall
104
+ # Devon
105
+ # Gloucestershire
106
+ # Somerset
107
+ # Wiltshire
108
+
109
+
110
+ Cornwall
111
+ Devon
112
+ Bristol & Gloucestershire | Gloucestershire
113
+ Somerset
114
+ Wiltshire
115
+
116
+
117
+ =================================
118
+ == East
119
+
120
+ # Beds, Herts & Bucks
121
+ # Cambridgeshire
122
+ # Essex
123
+ # Norfolk
124
+ # Suffolk
125
+
126
+ Bedfordshire
127
+ Hertfordshire
128
+ Buckinghamshire
129
+ Cambridgeshire
130
+ Essex
131
+ Norfolk
132
+ Suffolk
133
+
134
+
135
+ ===============================
136
+ == South
137
+
138
+ # Berkshire
139
+ # Dorset
140
+ # Hampshire & Isle of Wight
141
+ # Oxford
142
+
143
+ Berkshire
144
+ Dorset
145
+ Hampshire
146
+ Oxfordshire
147
+
148
+
149
+
150
+ ===============================
151
+ == London & South East
152
+
153
+ # Kent
154
+ # London
155
+ # Surrey
156
+ # Sussex
157
+
158
+ Greater London | London, Greater
159
+ Kent
160
+ Surrey
161
+ East Sussex | Sussex, East | Sussex (East)
162
+ West Sussex | Sussex, West | Sussex (West)
@@ -1,278 +1,278 @@
1
- # encoding: utf-8
2
-
3
-
4
- module SportDb
5
- module Import
6
-
7
-
8
- class ClubReader
9
-
10
-
11
- def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
12
- txt = File.open( path, 'r:utf-8' ).read
13
- parse( txt )
14
- end
15
-
16
-
17
- def self.parse( txt )
18
- recs = []
19
- last_rec = nil
20
- headings = [] ## headings stack
21
-
22
- txt.each_line do |line|
23
- line = line.strip
24
-
25
- next if line.empty?
26
- next if line.start_with?( '#' ) ## skip comments too
27
-
28
- ## strip inline (until end-of-line) comments too
29
- ## e.g Eupen => KAS Eupen, ## [de]
30
- ## => Eupen => KAS Eupen,
31
- line = line.sub( /#.*/, '' ).strip
32
- pp line
33
-
34
-
35
- next if line =~ /^={1,}$/ ## skip "decorative" only heading e.g. ========
36
-
37
- ## note: like in wikimedia markup (and markdown) all optional trailing ==== too
38
- ## todo/check: allow === Text =-=-=-=-=-= too - why? why not?
39
- if line =~ /^(={1,}) ## leading ======
40
- ([^=]+?) ## text (note: for now no "inline" = allowed)
41
- =* ## (optional) trailing ====
42
- $/x
43
- heading_marker = $1
44
- heading_level = $1.length ## count number of = for heading level
45
- heading = $2.strip
46
-
47
- puts "heading #{heading_level} >#{heading}<"
48
-
49
- ## 1) first pop headings if present
50
- while headings.size+1 > heading_level
51
- headings.pop
52
- end
53
-
54
- ## 2) add missing (hierarchy) level if
55
- while headings.size+1 < heading_level
56
- ## todo/fix: issue warning about "skipping" hierarchy level
57
- puts "!!! warn [team reader] - skipping hierarchy level in headings "
58
- headings.push( nil )
59
- end
60
-
61
- if heading =~ /^\?+$/ ## note: use ? or ?? or ?? to reset level to nil
62
- ## keep level empty
63
- else
64
-
65
- ## quick hack: if level is 1 assume country for now
66
- ## and extract country code e.g.
67
- ## Austria (at) => at
68
- ## todo/fix: allow code only e.g. at or aut without enclosing () too - why? why not?
69
- if heading_level == 1
70
- if heading =~ /\(([a-z]{2,3})\)/i ## note allow (at) or (AUT) too
71
- country_code = $1
72
-
73
- ## check country code - MUST exist for now!!!!
74
- country = SportDb::Import.config.countries[ country_code ]
75
- if country.nil?
76
- puts "!!! error [team reader] - unknown country with code >#{country_code}< - sorry - add country to config to fix"
77
- exit 1
78
- end
79
-
80
- headings.push( country_code )
81
- else
82
- puts "!!! error - heading level 1 - missing country code - >#{heading}<"
83
- exit 1
84
- end
85
- else
86
- ## quick hack:
87
- ## remove known fill/dummy words incl:
88
- ## Provincia San Juan => San Juan (see argentina, for example)
89
- ##
90
- ## use geo tree long term with alternative names - why? why not?
91
- words = ['Provincia']
92
- words.each { |word| heading = heading.gsub( word, '' ) }
93
- heading = heading.strip
94
-
95
- headings.push( heading )
96
- end
97
-
98
- ## assert that hierarchy level is ok
99
- if headings.size != heading_level
100
- puts "!!! error - headings hierarchy/stack out of order - #{heading.size}<=>#{heading_level}"
101
- exit 1
102
- end
103
- end
104
-
105
- pp headings
106
-
107
- elsif line.start_with?( '|' )
108
- ## assume continuation with line of alternative names
109
- ## note: skip leading pipe
110
- values = line[1..-1].split( '|' ) # team names - allow/use pipe(|)
111
- ## strip and squish (white)spaces
112
- # e.g. New York FC (2011-) => New York FC (2011-)
113
- values = values.map { |value| value.strip.gsub( /[ \t]+/, ' ' ) }
114
- last_rec.alt_names += values
115
- last_rec.add_variants( values ) # auto-add (possible) auto-generated variant names
116
-
117
- ## check for duplicates
118
- if last_rec.duplicates?
119
- duplicates = last_rec.duplicates
120
- puts "*** !!! WARN !!! - #{duplicates.size} duplicate alt name mapping(s):"
121
- pp duplicates
122
- pp last_rec
123
- ##
124
- ## todo/fix: make it only an error with exit 1
125
- ## if (not normalized) names are the same (not unique/uniq)
126
- ## e.g. don't exit on A.F.C. == AFC etc.
127
- ## exit 1
128
- end
129
- else
130
- values = line.split( ',' )
131
-
132
- rec = Club.new
133
- value = values.shift ## get first item
134
- ## strip and squish (white)spaces
135
- # e.g. New York FC (2011-) => New York FC (2011-)
136
- value = value.strip.gsub( /[ \t]+/, ' ' )
137
- rec.name = value # canoncial name (global unique "beautiful/long" name)
138
- rec.add_variants( value ) # auto-add (possible) auto-generated variant names
139
-
140
- ## note:
141
- ## check/todo!!!!!!!!!!!!!!!!!-
142
- ## strip year if to present e.g. (2011-)
143
- ##
144
- ## do NOT strip for defunct / historic clubs e.g.
145
- ## (1899-1910)
146
- ## or (-1914) or (-2011) etc.
147
-
148
- ###
149
- ## todo: move year out of canonical team name - why? why not?
150
-
151
- ## check if canonical name include (2011-) or similar in name
152
- ## if yes, remove (2011-) and add to (alt) names
153
- ## e.g. New York FC (2011) => New York FC
154
- if rec.name =~ /\(.+?\)/ ## note: use non-greedy (?) match
155
- name = rec.name.gsub( /\(.+?\)/, '' ).strip
156
-
157
- if rec.name =~ /\(([0-9]{4})-\)/ ## e.g. (2014-)
158
- rec.year = $1.to_i
159
- elsif rec.name =~ /\(-([0-9]{4})\)/ ## e.g. (-2014)
160
- rec.year_end = $1.to_i
161
- elsif rec.name =~ /\(([0-9]{4})-([0-9]{4})\)/ ## e.g. (2011-2014)
162
- rec.year = $1.to_i
163
- rec.year_end = $2.to_i
164
- else
165
- ## todo/check: warn about unknown year format
166
- end
167
- end
168
-
169
- ## todo/check - check for unknown format values
170
- ## e.g. too many values, duplicate years, etc.
171
- ## check for overwritting, etc.
172
- while values.size > 0
173
- value = values.shift
174
- ## strip and squish (white)spaces
175
- # e.g. León › Guanajuato => León › Guanajuato
176
- value = value.strip.gsub( /[ \t]+/, ' ' )
177
- if value =~/^\d{4}$/ # e.g 1904
178
- ## todo/check: issue warning if year is already set!!!!!!!
179
- if rec.year
180
- puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{value}:"
181
- pp rec
182
- exit 1
183
- end
184
- rec.year = value.to_i
185
- elsif value.start_with?( '@' ) # e.g. @ Anfield
186
- ## cut-off leading @ and spaces
187
- rec.ground = value[1..-1].strip
188
- else
189
- ## assume city / geo tree
190
- ## split into geo tree
191
- geos = split_geo( value )
192
- city = geos[0]
193
- ## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
194
- if city =~ /\((.+?)\)/ ## note: use non-greedy (?) match
195
- rec.district = $1.strip
196
- city = city.gsub( /\(.+?\)/, '' ).strip
197
- end
198
- rec.city = city
199
-
200
- if geos.size > 1
201
- ## cut-off city and keep the rest (of geo tree)
202
- rec.geos = geos[1..-1]
203
- end
204
- end
205
- end ## while values
206
-
207
-
208
- ###############
209
- ## use headings text for geo tree
210
-
211
- ## 1) add country if present
212
- if headings.size > 0 && headings[0]
213
- country = SportDb::Import.config.countries[ headings[0] ]
214
- rec.country = country
215
- else
216
- ## make it an error - why? why not?
217
- puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
218
- exit 1
219
- end
220
-
221
- ## 2) check geo tree with headings hierarchy
222
- if headings.size > 1 && headings[1]
223
- geos = split_geo( headings[1] )
224
- if rec.geos
225
- if rec.geos[0] != geos[0]
226
- puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
227
- exit 1
228
- end
229
- if rec.geos[1] && rec.geos[1] != geos[1] ## check optional 2nd level too
230
- puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
231
- exit 1
232
- end
233
- else
234
- ## add missing region (state/province) from headings hierarchy
235
- rec.geos = geos
236
- end
237
- end
238
-
239
- last_rec = rec
240
-
241
-
242
- ### todo/fix:
243
- ## auto-add alt name with dots stripped - why? why not?
244
- ## e.g. D.C. United => DC United
245
- ## e.g. Liverpool F.C. => Liverpool FC
246
- ## e.g. St. Albin => St Albin etc.
247
- ## e.g. 1. FC Köln => 1 FC Köln -- make special case for 1. - why? why not?
248
-
249
- ##
250
- ## todo/fix: unify mapping entries
251
- ## always lowercase !!!! (case insensitive)
252
- ## always strip (2011-) !!!
253
- ## always strip dots (e.g. St., F.C, etc.)
254
-
255
- recs << rec
256
- end
257
- end # each_line
258
- recs
259
- end # method read
260
-
261
- ### helpers
262
- def self.split_geo( str )
263
- ## assume city / geo tree
264
- ## strip and squish (white)spaces
265
- # e.g. León › Guanajuato => León › Guanajuato
266
- str = str.strip.gsub( /[ \t]+/, ' ' )
267
-
268
- ## split into geo tree
269
- geos = str.split( /[<>‹›]/ ) ## note: allow > < or › ‹
270
- geos = geos.map { |geo| geo.strip } ## remove all whitespaces
271
- geos
272
- end
273
-
274
- end # class ClubReader
275
-
276
-
277
- end ## module Import
278
- end ## module SportDb
1
+ # encoding: utf-8
2
+
3
+
4
+ module SportDb
5
+ module Import
6
+
7
+
8
+ class ClubReader
9
+
10
+
11
+ def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
12
+ txt = File.open( path, 'r:utf-8' ).read
13
+ parse( txt )
14
+ end
15
+
16
+
17
+ def self.parse( txt )
18
+ recs = []
19
+ last_rec = nil
20
+ headings = [] ## headings stack
21
+
22
+ txt.each_line do |line|
23
+ line = line.strip
24
+
25
+ next if line.empty?
26
+ next if line.start_with?( '#' ) ## skip comments too
27
+
28
+ ## strip inline (until end-of-line) comments too
29
+ ## e.g Eupen => KAS Eupen, ## [de]
30
+ ## => Eupen => KAS Eupen,
31
+ line = line.sub( /#.*/, '' ).strip
32
+ pp line
33
+
34
+
35
+ next if line =~ /^={1,}$/ ## skip "decorative" only heading e.g. ========
36
+
37
+ ## note: like in wikimedia markup (and markdown) all optional trailing ==== too
38
+ ## todo/check: allow === Text =-=-=-=-=-= too - why? why not?
39
+ if line =~ /^(={1,}) ## leading ======
40
+ ([^=]+?) ## text (note: for now no "inline" = allowed)
41
+ =* ## (optional) trailing ====
42
+ $/x
43
+ heading_marker = $1
44
+ heading_level = $1.length ## count number of = for heading level
45
+ heading = $2.strip
46
+
47
+ puts "heading #{heading_level} >#{heading}<"
48
+
49
+ ## 1) first pop headings if present
50
+ while headings.size+1 > heading_level
51
+ headings.pop
52
+ end
53
+
54
+ ## 2) add missing (hierarchy) level if
55
+ while headings.size+1 < heading_level
56
+ ## todo/fix: issue warning about "skipping" hierarchy level
57
+ puts "!!! warn [team reader] - skipping hierarchy level in headings "
58
+ headings.push( nil )
59
+ end
60
+
61
+ if heading =~ /^\?+$/ ## note: use ? or ?? or ?? to reset level to nil
62
+ ## keep level empty
63
+ else
64
+
65
+ ## quick hack: if level is 1 assume country for now
66
+ ## and extract country code e.g.
67
+ ## Austria (at) => at
68
+ ## todo/fix: allow code only e.g. at or aut without enclosing () too - why? why not?
69
+ if heading_level == 1
70
+ if heading =~ /\(([a-z]{2,3})\)/i ## note allow (at) or (AUT) too
71
+ country_code = $1
72
+
73
+ ## check country code - MUST exist for now!!!!
74
+ country = SportDb::Import.config.countries[ country_code ]
75
+ if country.nil?
76
+ puts "!!! error [team reader] - unknown country with code >#{country_code}< - sorry - add country to config to fix"
77
+ exit 1
78
+ end
79
+
80
+ headings.push( country_code )
81
+ else
82
+ puts "!!! error - heading level 1 - missing country code - >#{heading}<"
83
+ exit 1
84
+ end
85
+ else
86
+ ## quick hack:
87
+ ## remove known fill/dummy words incl:
88
+ ## Provincia San Juan => San Juan (see argentina, for example)
89
+ ##
90
+ ## use geo tree long term with alternative names - why? why not?
91
+ words = ['Provincia']
92
+ words.each { |word| heading = heading.gsub( word, '' ) }
93
+ heading = heading.strip
94
+
95
+ headings.push( heading )
96
+ end
97
+
98
+ ## assert that hierarchy level is ok
99
+ if headings.size != heading_level
100
+ puts "!!! error - headings hierarchy/stack out of order - #{heading.size}<=>#{heading_level}"
101
+ exit 1
102
+ end
103
+ end
104
+
105
+ pp headings
106
+
107
+ elsif line.start_with?( '|' )
108
+ ## assume continuation with line of alternative names
109
+ ## note: skip leading pipe
110
+ values = line[1..-1].split( '|' ) # team names - allow/use pipe(|)
111
+ ## strip and squish (white)spaces
112
+ # e.g. New York FC (2011-) => New York FC (2011-)
113
+ values = values.map { |value| value.strip.gsub( /[ \t]+/, ' ' ) }
114
+ last_rec.alt_names += values
115
+ last_rec.add_variants( values ) # auto-add (possible) auto-generated variant names
116
+
117
+ ## check for duplicates
118
+ if last_rec.duplicates?
119
+ duplicates = last_rec.duplicates
120
+ puts "*** !!! WARN !!! - #{duplicates.size} duplicate alt name mapping(s):"
121
+ pp duplicates
122
+ pp last_rec
123
+ ##
124
+ ## todo/fix: make it only an error with exit 1
125
+ ## if (not normalized) names are the same (not unique/uniq)
126
+ ## e.g. don't exit on A.F.C. == AFC etc.
127
+ ## exit 1
128
+ end
129
+ else
130
+ values = line.split( ',' )
131
+
132
+ rec = Club.new
133
+ value = values.shift ## get first item
134
+ ## strip and squish (white)spaces
135
+ # e.g. New York FC (2011-) => New York FC (2011-)
136
+ value = value.strip.gsub( /[ \t]+/, ' ' )
137
+ rec.name = value # canoncial name (global unique "beautiful/long" name)
138
+ rec.add_variants( value ) # auto-add (possible) auto-generated variant names
139
+
140
+ ## note:
141
+ ## check/todo!!!!!!!!!!!!!!!!!-
142
+ ## strip year if to present e.g. (2011-)
143
+ ##
144
+ ## do NOT strip for defunct / historic clubs e.g.
145
+ ## (1899-1910)
146
+ ## or (-1914) or (-2011) etc.
147
+
148
+ ###
149
+ ## todo: move year out of canonical team name - why? why not?
150
+
151
+ ## check if canonical name include (2011-) or similar in name
152
+ ## if yes, remove (2011-) and add to (alt) names
153
+ ## e.g. New York FC (2011) => New York FC
154
+ if rec.name =~ /\(.+?\)/ ## note: use non-greedy (?) match
155
+ name = rec.name.gsub( /\(.+?\)/, '' ).strip
156
+
157
+ if rec.name =~ /\(([0-9]{4})-\)/ ## e.g. (2014-)
158
+ rec.year = $1.to_i
159
+ elsif rec.name =~ /\(-([0-9]{4})\)/ ## e.g. (-2014)
160
+ rec.year_end = $1.to_i
161
+ elsif rec.name =~ /\(([0-9]{4})-([0-9]{4})\)/ ## e.g. (2011-2014)
162
+ rec.year = $1.to_i
163
+ rec.year_end = $2.to_i
164
+ else
165
+ ## todo/check: warn about unknown year format
166
+ end
167
+ end
168
+
169
+ ## todo/check - check for unknown format values
170
+ ## e.g. too many values, duplicate years, etc.
171
+ ## check for overwritting, etc.
172
+ while values.size > 0
173
+ value = values.shift
174
+ ## strip and squish (white)spaces
175
+ # e.g. León › Guanajuato => León › Guanajuato
176
+ value = value.strip.gsub( /[ \t]+/, ' ' )
177
+ if value =~/^\d{4}$/ # e.g 1904
178
+ ## todo/check: issue warning if year is already set!!!!!!!
179
+ if rec.year
180
+ puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{value}:"
181
+ pp rec
182
+ exit 1
183
+ end
184
+ rec.year = value.to_i
185
+ elsif value.start_with?( '@' ) # e.g. @ Anfield
186
+ ## cut-off leading @ and spaces
187
+ rec.ground = value[1..-1].strip
188
+ else
189
+ ## assume city / geo tree
190
+ ## split into geo tree
191
+ geos = split_geo( value )
192
+ city = geos[0]
193
+ ## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
194
+ if city =~ /\((.+?)\)/ ## note: use non-greedy (?) match
195
+ rec.district = $1.strip
196
+ city = city.gsub( /\(.+?\)/, '' ).strip
197
+ end
198
+ rec.city = city
199
+
200
+ if geos.size > 1
201
+ ## cut-off city and keep the rest (of geo tree)
202
+ rec.geos = geos[1..-1]
203
+ end
204
+ end
205
+ end ## while values
206
+
207
+
208
+ ###############
209
+ ## use headings text for geo tree
210
+
211
+ ## 1) add country if present
212
+ if headings.size > 0 && headings[0]
213
+ country = SportDb::Import.config.countries[ headings[0] ]
214
+ rec.country = country
215
+ else
216
+ ## make it an error - why? why not?
217
+ puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
218
+ exit 1
219
+ end
220
+
221
+ ## 2) check geo tree with headings hierarchy
222
+ if headings.size > 1 && headings[1]
223
+ geos = split_geo( headings[1] )
224
+ if rec.geos
225
+ if rec.geos[0] != geos[0]
226
+ puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
227
+ exit 1
228
+ end
229
+ if rec.geos[1] && rec.geos[1] != geos[1] ## check optional 2nd level too
230
+ puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
231
+ exit 1
232
+ end
233
+ else
234
+ ## add missing region (state/province) from headings hierarchy
235
+ rec.geos = geos
236
+ end
237
+ end
238
+
239
+ last_rec = rec
240
+
241
+
242
+ ### todo/fix:
243
+ ## auto-add alt name with dots stripped - why? why not?
244
+ ## e.g. D.C. United => DC United
245
+ ## e.g. Liverpool F.C. => Liverpool FC
246
+ ## e.g. St. Albin => St Albin etc.
247
+ ## e.g. 1. FC Köln => 1 FC Köln -- make special case for 1. - why? why not?
248
+
249
+ ##
250
+ ## todo/fix: unify mapping entries
251
+ ## always lowercase !!!! (case insensitive)
252
+ ## always strip (2011-) !!!
253
+ ## always strip dots (e.g. St., F.C, etc.)
254
+
255
+ recs << rec
256
+ end
257
+ end # each_line
258
+ recs
259
+ end # method read
260
+
261
+ ### helpers
262
+ def self.split_geo( str )
263
+ ## assume city / geo tree
264
+ ## strip and squish (white)spaces
265
+ # e.g. León › Guanajuato => León › Guanajuato
266
+ str = str.strip.gsub( /[ \t]+/, ' ' )
267
+
268
+ ## split into geo tree
269
+ geos = str.split( /[<>‹›]/ ) ## note: allow > < or › ‹
270
+ geos = geos.map { |geo| geo.strip } ## remove all whitespaces
271
+ geos
272
+ end
273
+
274
+ end # class ClubReader
275
+
276
+
277
+ end ## module Import
278
+ end ## module SportDb