sportdb-config 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,278 +1,278 @@
1
- # encoding: utf-8
2
-
3
-
4
- module SportDb
5
- module Import
6
-
7
-
8
- class ClubReader
9
-
10
-
11
- def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
12
- txt = File.open( path, 'r:utf-8' ).read
13
- parse( txt )
14
- end
15
-
16
-
17
- def self.parse( txt )
18
- recs = []
19
- last_rec = nil
20
- headings = [] ## headings stack
21
-
22
- txt.each_line do |line|
23
- line = line.strip
24
-
25
- next if line.empty?
26
- next if line.start_with?( '#' ) ## skip comments too
27
-
28
- ## strip inline (until end-of-line) comments too
29
- ## e.g Eupen => KAS Eupen, ## [de]
30
- ## => Eupen => KAS Eupen,
31
- line = line.sub( /#.*/, '' ).strip
32
- pp line
33
-
34
-
35
- next if line =~ /^={1,}$/ ## skip "decorative" only heading e.g. ========
36
-
37
- ## note: like in wikimedia markup (and markdown) all optional trailing ==== too
38
- ## todo/check: allow === Text =-=-=-=-=-= too - why? why not?
39
- if line =~ /^(={1,}) ## leading ======
40
- ([^=]+?) ## text (note: for now no "inline" = allowed)
41
- =* ## (optional) trailing ====
42
- $/x
43
- heading_marker = $1
44
- heading_level = $1.length ## count number of = for heading level
45
- heading = $2.strip
46
-
47
- puts "heading #{heading_level} >#{heading}<"
48
-
49
- ## 1) first pop headings if present
50
- while headings.size+1 > heading_level
51
- headings.pop
52
- end
53
-
54
- ## 2) add missing (hierarchy) level if
55
- while headings.size+1 < heading_level
56
- ## todo/fix: issue warning about "skipping" hierarchy level
57
- puts "!!! warn [team reader] - skipping hierarchy level in headings "
58
- headings.push( nil )
59
- end
60
-
61
- if heading =~ /^\?+$/ ## note: use ? or ?? or ?? to reset level to nil
62
- ## keep level empty
63
- else
64
-
65
- ## quick hack: if level is 1 assume country for now
66
- ## and extract country code e.g.
67
- ## Austria (at) => at
68
- ## todo/fix: allow code only e.g. at or aut without enclosing () too - why? why not?
69
- if heading_level == 1
70
- if heading =~ /\(([a-z]{2,3})\)/i ## note allow (at) or (AUT) too
71
- country_code = $1
72
-
73
- ## check country code - MUST exist for now!!!!
74
- country = SportDb::Import.config.countries[ country_code ]
75
- if country.nil?
76
- puts "!!! error [team reader] - unknown country with code >#{country_code}< - sorry - add country to config to fix"
77
- exit 1
78
- end
79
-
80
- headings.push( country_code )
81
- else
82
- puts "!!! error - heading level 1 - missing country code - >#{heading}<"
83
- exit 1
84
- end
85
- else
86
- ## quick hack:
87
- ## remove known fill/dummy words incl:
88
- ## Provincia San Juan => San Juan (see argentina, for example)
89
- ##
90
- ## use geo tree long term with alternative names - why? why not?
91
- words = ['Provincia']
92
- words.each { |word| heading = heading.gsub( word, '' ) }
93
- heading = heading.strip
94
-
95
- headings.push( heading )
96
- end
97
-
98
- ## assert that hierarchy level is ok
99
- if headings.size != heading_level
100
- puts "!!! error - headings hierarchy/stack out of order - #{heading.size}<=>#{heading_level}"
101
- exit 1
102
- end
103
- end
104
-
105
- pp headings
106
-
107
- elsif line.start_with?( '|' )
108
- ## assume continuation with line of alternative names
109
- ## note: skip leading pipe
110
- values = line[1..-1].split( '|' ) # team names - allow/use pipe(|)
111
- ## strip and squish (white)spaces
112
- # e.g. New York FC (2011-) => New York FC (2011-)
113
- values = values.map { |value| value.strip.gsub( /[ \t]+/, ' ' ) }
114
- last_rec.alt_names += values
115
- last_rec.add_variants( values ) # auto-add (possible) auto-generated variant names
116
-
117
- ## check for duplicates
118
- if last_rec.duplicates?
119
- duplicates = last_rec.duplicates
120
- puts "*** !!! WARN !!! - #{duplicates.size} duplicate alt name mapping(s):"
121
- pp duplicates
122
- pp last_rec
123
- ##
124
- ## todo/fix: make it only an error with exit 1
125
- ## if (not normalized) names are the same (not unique/uniq)
126
- ## e.g. don't exit on A.F.C. == AFC etc.
127
- ## exit 1
128
- end
129
- else
130
- values = line.split( ',' )
131
-
132
- rec = Club.new
133
- value = values.shift ## get first item
134
- ## strip and squish (white)spaces
135
- # e.g. New York FC (2011-) => New York FC (2011-)
136
- value = value.strip.gsub( /[ \t]+/, ' ' )
137
- rec.name = value # canoncial name (global unique "beautiful/long" name)
138
- rec.add_variants( value ) # auto-add (possible) auto-generated variant names
139
-
140
- ## note:
141
- ## check/todo!!!!!!!!!!!!!!!!!-
142
- ## strip year if to present e.g. (2011-)
143
- ##
144
- ## do NOT strip for defunct / historic clubs e.g.
145
- ## (1899-1910)
146
- ## or (-1914) or (-2011) etc.
147
-
148
- ###
149
- ## todo: move year out of canonical team name - why? why not?
150
-
151
- ## check if canonical name include (2011-) or similar in name
152
- ## if yes, remove (2011-) and add to (alt) names
153
- ## e.g. New York FC (2011) => New York FC
154
- if rec.name =~ /\(.+?\)/ ## note: use non-greedy (?) match
155
- name = rec.name.gsub( /\(.+?\)/, '' ).strip
156
-
157
- if rec.name =~ /\(([0-9]{4})-\)/ ## e.g. (2014-)
158
- rec.year = $1.to_i
159
- elsif rec.name =~ /\(-([0-9]{4})\)/ ## e.g. (-2014)
160
- rec.year_end = $1.to_i
161
- elsif rec.name =~ /\(([0-9]{4})-([0-9]{4})\)/ ## e.g. (2011-2014)
162
- rec.year = $1.to_i
163
- rec.year_end = $2.to_i
164
- else
165
- ## todo/check: warn about unknown year format
166
- end
167
- end
168
-
169
- ## todo/check - check for unknown format values
170
- ## e.g. too many values, duplicate years, etc.
171
- ## check for overwritting, etc.
172
- while values.size > 0
173
- value = values.shift
174
- ## strip and squish (white)spaces
175
- # e.g. León › Guanajuato => León › Guanajuato
176
- value = value.strip.gsub( /[ \t]+/, ' ' )
177
- if value =~/^\d{4}$/ # e.g 1904
178
- ## todo/check: issue warning if year is already set!!!!!!!
179
- if rec.year
180
- puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{value}:"
181
- pp rec
182
- exit 1
183
- end
184
- rec.year = value.to_i
185
- elsif value.start_with?( '@' ) # e.g. @ Anfield
186
- ## cut-off leading @ and spaces
187
- rec.ground = value[1..-1].strip
188
- else
189
- ## assume city / geo tree
190
- ## split into geo tree
191
- geos = split_geo( value )
192
- city = geos[0]
193
- ## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
194
- if city =~ /\((.+?)\)/ ## note: use non-greedy (?) match
195
- rec.district = $1.strip
196
- city = city.gsub( /\(.+?\)/, '' ).strip
197
- end
198
- rec.city = city
199
-
200
- if geos.size > 1
201
- ## cut-off city and keep the rest (of geo tree)
202
- rec.geos = geos[1..-1]
203
- end
204
- end
205
- end ## while values
206
-
207
-
208
- ###############
209
- ## use headings text for geo tree
210
-
211
- ## 1) add country if present
212
- if headings.size > 0 && headings[0]
213
- country = SportDb::Import.config.countries[ headings[0] ]
214
- rec.country = country
215
- else
216
- ## make it an error - why? why not?
217
- puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
218
- exit 1
219
- end
220
-
221
- ## 2) check geo tree with headings hierarchy
222
- if headings.size > 1 && headings[1]
223
- geos = split_geo( headings[1] )
224
- if rec.geos
225
- if rec.geos[0] != geos[0]
226
- puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
227
- exit 1
228
- end
229
- if rec.geos[1] && rec.geos[1] != geos[1] ## check optional 2nd level too
230
- puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
231
- exit 1
232
- end
233
- else
234
- ## add missing region (state/province) from headings hierarchy
235
- rec.geos = geos
236
- end
237
- end
238
-
239
- last_rec = rec
240
-
241
-
242
- ### todo/fix:
243
- ## auto-add alt name with dots stripped - why? why not?
244
- ## e.g. D.C. United => DC United
245
- ## e.g. Liverpool F.C. => Liverpool FC
246
- ## e.g. St. Albin => St Albin etc.
247
- ## e.g. 1. FC Köln => 1 FC Köln -- make special case for 1. - why? why not?
248
-
249
- ##
250
- ## todo/fix: unify mapping entries
251
- ## always lowercase !!!! (case insensitive)
252
- ## always strip (2011-) !!!
253
- ## always strip dots (e.g. St., F.C, etc.)
254
-
255
- recs << rec
256
- end
257
- end # each_line
258
- recs
259
- end # method read
260
-
261
- ### helpers
262
- def self.split_geo( str )
263
- ## assume city / geo tree
264
- ## strip and squish (white)spaces
265
- # e.g. León › Guanajuato => León › Guanajuato
266
- str = str.strip.gsub( /[ \t]+/, ' ' )
267
-
268
- ## split into geo tree
269
- geos = str.split( /[<>‹›]/ ) ## note: allow > < or › ‹
270
- geos = geos.map { |geo| geo.strip } ## remove all whitespaces
271
- geos
272
- end
273
-
274
- end # class ClubReader
275
-
276
-
277
- end ## module Import
278
- end ## module SportDb
1
+ # encoding: utf-8
2
+
3
+
4
+ module SportDb
5
+ module Import
6
+
7
+
8
+ class ClubReader
9
+
10
+
11
+ def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
12
+ txt = File.open( path, 'r:utf-8' ).read
13
+ parse( txt )
14
+ end
15
+
16
+
17
+ def self.parse( txt )
18
+ recs = []
19
+ last_rec = nil
20
+ headings = [] ## headings stack
21
+
22
+ txt.each_line do |line|
23
+ line = line.strip
24
+
25
+ next if line.empty?
26
+ next if line.start_with?( '#' ) ## skip comments too
27
+
28
+ ## strip inline (until end-of-line) comments too
29
+ ## e.g Eupen => KAS Eupen, ## [de]
30
+ ## => Eupen => KAS Eupen,
31
+ line = line.sub( /#.*/, '' ).strip
32
+ pp line
33
+
34
+
35
+ next if line =~ /^={1,}$/ ## skip "decorative" only heading e.g. ========
36
+
37
+ ## note: like in wikimedia markup (and markdown) all optional trailing ==== too
38
+ ## todo/check: allow === Text =-=-=-=-=-= too - why? why not?
39
+ if line =~ /^(={1,}) ## leading ======
40
+ ([^=]+?) ## text (note: for now no "inline" = allowed)
41
+ =* ## (optional) trailing ====
42
+ $/x
43
+ heading_marker = $1
44
+ heading_level = $1.length ## count number of = for heading level
45
+ heading = $2.strip
46
+
47
+ puts "heading #{heading_level} >#{heading}<"
48
+
49
+ ## 1) first pop headings if present
50
+ while headings.size+1 > heading_level
51
+ headings.pop
52
+ end
53
+
54
+ ## 2) add missing (hierarchy) level if
55
+ while headings.size+1 < heading_level
56
+ ## todo/fix: issue warning about "skipping" hierarchy level
57
+ puts "!!! warn [team reader] - skipping hierarchy level in headings "
58
+ headings.push( nil )
59
+ end
60
+
61
+ if heading =~ /^\?+$/ ## note: use ? or ?? or ?? to reset level to nil
62
+ ## keep level empty
63
+ else
64
+
65
+ ## quick hack: if level is 1 assume country for now
66
+ ## and extract country code e.g.
67
+ ## Austria (at) => at
68
+ ## todo/fix: allow code only e.g. at or aut without enclosing () too - why? why not?
69
+ if heading_level == 1
70
+ if heading =~ /\(([a-z]{2,3})\)/i ## note allow (at) or (AUT) too
71
+ country_code = $1
72
+
73
+ ## check country code - MUST exist for now!!!!
74
+ country = SportDb::Import.config.countries[ country_code ]
75
+ if country.nil?
76
+ puts "!!! error [team reader] - unknown country with code >#{country_code}< - sorry - add country to config to fix"
77
+ exit 1
78
+ end
79
+
80
+ headings.push( country_code )
81
+ else
82
+ puts "!!! error - heading level 1 - missing country code - >#{heading}<"
83
+ exit 1
84
+ end
85
+ else
86
+ ## quick hack:
87
+ ## remove known fill/dummy words incl:
88
+ ## Provincia San Juan => San Juan (see argentina, for example)
89
+ ##
90
+ ## use geo tree long term with alternative names - why? why not?
91
+ words = ['Provincia']
92
+ words.each { |word| heading = heading.gsub( word, '' ) }
93
+ heading = heading.strip
94
+
95
+ headings.push( heading )
96
+ end
97
+
98
+ ## assert that hierarchy level is ok
99
+ if headings.size != heading_level
100
+ puts "!!! error - headings hierarchy/stack out of order - #{heading.size}<=>#{heading_level}"
101
+ exit 1
102
+ end
103
+ end
104
+
105
+ pp headings
106
+
107
+ elsif line.start_with?( '|' )
108
+ ## assume continuation with line of alternative names
109
+ ## note: skip leading pipe
110
+ values = line[1..-1].split( '|' ) # team names - allow/use pipe(|)
111
+ ## strip and squish (white)spaces
112
+ # e.g. New York FC (2011-) => New York FC (2011-)
113
+ values = values.map { |value| value.strip.gsub( /[ \t]+/, ' ' ) }
114
+ last_rec.alt_names += values
115
+ last_rec.add_variants( values ) # auto-add (possible) auto-generated variant names
116
+
117
+ ## check for duplicates
118
+ if last_rec.duplicates?
119
+ duplicates = last_rec.duplicates
120
+ puts "*** !!! WARN !!! - #{duplicates.size} duplicate alt name mapping(s):"
121
+ pp duplicates
122
+ pp last_rec
123
+ ##
124
+ ## todo/fix: make it only an error with exit 1
125
+ ## if (not normalized) names are the same (not unique/uniq)
126
+ ## e.g. don't exit on A.F.C. == AFC etc.
127
+ ## exit 1
128
+ end
129
+ else
130
+ values = line.split( ',' )
131
+
132
+ rec = Club.new
133
+ value = values.shift ## get first item
134
+ ## strip and squish (white)spaces
135
+ # e.g. New York FC (2011-) => New York FC (2011-)
136
+ value = value.strip.gsub( /[ \t]+/, ' ' )
137
+ rec.name = value # canoncial name (global unique "beautiful/long" name)
138
+ rec.add_variants( value ) # auto-add (possible) auto-generated variant names
139
+
140
+ ## note:
141
+ ## check/todo!!!!!!!!!!!!!!!!!-
142
+ ## strip year if to present e.g. (2011-)
143
+ ##
144
+ ## do NOT strip for defunct / historic clubs e.g.
145
+ ## (1899-1910)
146
+ ## or (-1914) or (-2011) etc.
147
+
148
+ ###
149
+ ## todo: move year out of canonical team name - why? why not?
150
+
151
+ ## check if canonical name include (2011-) or similar in name
152
+ ## if yes, remove (2011-) and add to (alt) names
153
+ ## e.g. New York FC (2011) => New York FC
154
+ if rec.name =~ /\(.+?\)/ ## note: use non-greedy (?) match
155
+ name = rec.name.gsub( /\(.+?\)/, '' ).strip
156
+
157
+ if rec.name =~ /\(([0-9]{4})-\)/ ## e.g. (2014-)
158
+ rec.year = $1.to_i
159
+ elsif rec.name =~ /\(-([0-9]{4})\)/ ## e.g. (-2014)
160
+ rec.year_end = $1.to_i
161
+ elsif rec.name =~ /\(([0-9]{4})-([0-9]{4})\)/ ## e.g. (2011-2014)
162
+ rec.year = $1.to_i
163
+ rec.year_end = $2.to_i
164
+ else
165
+ ## todo/check: warn about unknown year format
166
+ end
167
+ end
168
+
169
+ ## todo/check - check for unknown format values
170
+ ## e.g. too many values, duplicate years, etc.
171
+ ## check for overwritting, etc.
172
+ while values.size > 0
173
+ value = values.shift
174
+ ## strip and squish (white)spaces
175
+ # e.g. León › Guanajuato => León › Guanajuato
176
+ value = value.strip.gsub( /[ \t]+/, ' ' )
177
+ if value =~/^\d{4}$/ # e.g 1904
178
+ ## todo/check: issue warning if year is already set!!!!!!!
179
+ if rec.year
180
+ puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{value}:"
181
+ pp rec
182
+ exit 1
183
+ end
184
+ rec.year = value.to_i
185
+ elsif value.start_with?( '@' ) # e.g. @ Anfield
186
+ ## cut-off leading @ and spaces
187
+ rec.ground = value[1..-1].strip
188
+ else
189
+ ## assume city / geo tree
190
+ ## split into geo tree
191
+ geos = split_geo( value )
192
+ city = geos[0]
193
+ ## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
194
+ if city =~ /\((.+?)\)/ ## note: use non-greedy (?) match
195
+ rec.district = $1.strip
196
+ city = city.gsub( /\(.+?\)/, '' ).strip
197
+ end
198
+ rec.city = city
199
+
200
+ if geos.size > 1
201
+ ## cut-off city and keep the rest (of geo tree)
202
+ rec.geos = geos[1..-1]
203
+ end
204
+ end
205
+ end ## while values
206
+
207
+
208
+ ###############
209
+ ## use headings text for geo tree
210
+
211
+ ## 1) add country if present
212
+ if headings.size > 0 && headings[0]
213
+ country = SportDb::Import.config.countries[ headings[0] ]
214
+ rec.country = country
215
+ else
216
+ ## make it an error - why? why not?
217
+ puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
218
+ exit 1
219
+ end
220
+
221
+ ## 2) check geo tree with headings hierarchy
222
+ if headings.size > 1 && headings[1]
223
+ geos = split_geo( headings[1] )
224
+ if rec.geos
225
+ if rec.geos[0] != geos[0]
226
+ puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
227
+ exit 1
228
+ end
229
+ if rec.geos[1] && rec.geos[1] != geos[1] ## check optional 2nd level too
230
+ puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
231
+ exit 1
232
+ end
233
+ else
234
+ ## add missing region (state/province) from headings hierarchy
235
+ rec.geos = geos
236
+ end
237
+ end
238
+
239
+ last_rec = rec
240
+
241
+
242
+ ### todo/fix:
243
+ ## auto-add alt name with dots stripped - why? why not?
244
+ ## e.g. D.C. United => DC United
245
+ ## e.g. Liverpool F.C. => Liverpool FC
246
+ ## e.g. St. Albin => St Albin etc.
247
+ ## e.g. 1. FC Köln => 1 FC Köln -- make special case for 1. - why? why not?
248
+
249
+ ##
250
+ ## todo/fix: unify mapping entries
251
+ ## always lowercase !!!! (case insensitive)
252
+ ## always strip (2011-) !!!
253
+ ## always strip dots (e.g. St., F.C, etc.)
254
+
255
+ recs << rec
256
+ end
257
+ end # each_line
258
+ recs
259
+ end # method read
260
+
261
+ ### helpers
262
+ def self.split_geo( str )
263
+ ## assume city / geo tree
264
+ ## strip and squish (white)spaces
265
+ # e.g. León › Guanajuato => León › Guanajuato
266
+ str = str.strip.gsub( /[ \t]+/, ' ' )
267
+
268
+ ## split into geo tree
269
+ geos = str.split( /[<>‹›]/ ) ## note: allow > < or › ‹
270
+ geos = geos.map { |geo| geo.strip } ## remove all whitespaces
271
+ geos
272
+ end
273
+
274
+ end # class ClubReader
275
+
276
+
277
+ end ## module Import
278
+ end ## module SportDb