sportdb-formats 1.1.5 → 1.1.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,134 +1,134 @@
1
- # encoding: utf-8
2
-
3
- module SportDb
4
- module Import
5
-
6
-
7
- class ClubHistoryIndex
8
-
9
- def self.build( path )
10
- pack = Package.new( path ) ## lets us use direcotry or zip archive
11
-
12
- recs = []
13
- pack.each_clubs_history do |entry|
14
- recs += ClubHistoryReader.parse( entry.read )
15
- end
16
- recs
17
-
18
- index = new
19
- index.add( recs )
20
- index
21
- end
22
-
23
-
24
-
25
- def catalog() Import.catalog; end
26
-
27
- ## note: keep name history for now separate from
28
- ## from club struct - why? why not?
29
- ## later yes, yes, yes, merge name history into club struct!!!!!
30
- ##
31
- ## for now the name history is experimental
32
-
33
-
34
- def initialize
35
- @clubs = {} ## clubs (indexed) by canonical name
36
- @errors = []
37
- end
38
-
39
- attr_reader :errors
40
- def errors?() @errors.empty? == false; end
41
-
42
- def mappings() @clubs; end ## todo/check: rename to records or histories or something - why? why not?
43
-
44
-
45
- def add_history( club_rec, keyword, season, args )
46
- ## note use season obj for now (and NOT key) - why? why not?
47
- rec = @clubs[ club_rec.name ] ||= []
48
-
49
- rec << [season, [keyword, args]]
50
-
51
- ## note: always keep records sorted by season_key for now
52
- ## check if 2010 and 2010/11 is in order using alpha sort?? (see argentina)
53
- rec.sort! { |l,r| r[0] <=> l[0] }
54
- end
55
-
56
-
57
- def add( rec_or_recs ) ## add club record / alt_names
58
- recs = rec_or_recs.is_a?( Array ) ? rec_or_recs : [rec_or_recs] ## wrap (single) rec in array
59
-
60
- recs.each do |rec|
61
-
62
- keyword = rec[0]
63
- season_key = rec[1]
64
- args = rec[2..-1] ## get rest of args e.g. one, two or more
65
-
66
- ## note: for now only add (re)name history season records,
67
- ## that is, skip MERGE and BANKRUPT for now
68
- ## and incl. only RENAME, REFORM, MOVE for now
69
- next if ['MERGE', 'BANKRUPT'].include?( keyword )
70
-
71
-
72
- name_old = strip_geo( args[0][0] ) ## note: strip optional geo part from name
73
- name_new = strip_geo( args[1][0] )
74
-
75
- country_old = args[0][1]
76
- country_new = args[1][1]
77
-
78
- club_old = catalog.clubs.find_by!( name: name_old, country: country_old )
79
- club_new = catalog.clubs.find_by!( name: name_new, country: country_new )
80
-
81
- ## note use season obj for now (and NOT key) - why? why not?
82
- season = Season.parse( season_key )
83
-
84
- ## todo/check:
85
- ## check if club_old and club_new reference different club record!!
86
- ## examples - RB II -> Liefering ?? or
87
- ## FC Pasching -> OOE Juniors ??
88
- ## Austria Salzburg -> RB Salburg ??
89
- ## for now always add name history to both - why? why not?
90
-
91
- add_history( club_old, keyword, season, args )
92
- ## note: allow for now different club references
93
- ## but maybe warn later - why? why not?
94
- ## add history to both for now
95
- add_history( club_new, keyword, season, args ) if club_old != club_new
96
- end # each rec
97
- end # method add
98
-
99
-
100
- #### todo/check: move as method to club struct later - to always use club reference
101
- ## returns (simply) name as string for now or nil - why? why not?
102
- #
103
- # history entry example
104
- # Arsenal FC"=>
105
- # [[1927/28, ["RENAME", [["The Arsenal FC, London", "eng"], ["Arsenal FC", "eng"]]]],
106
- # [1914/15, ["RENAME", [["Woolwich Arsenal FC, London", "eng"], ["The Arsenal FC", "eng"]]]],
107
- # [1892/93, ["RENAME", [["Royal Arsenal FC, London", "eng"], ["Woolwich Arsenal FC", "eng"]]]]],
108
- def find_name_by( name:, season: )
109
- recs = @clubs[ name ]
110
- if recs
111
- season = Season( season ) ## make sure season is a season obj (and NOT a string)
112
- ## check season records for name; use linear search (assume only few records)
113
- recs.each do |rec|
114
- if season >= rec[0]
115
- return strip_geo( rec[1][1][1][0] ) # use second arg
116
- end
117
- end
118
- ## if we get here use last name
119
- strip_geo( recs[-1][1][1][0][0] ) # use first arg
120
- else
121
- nil
122
- end
123
- end
124
-
125
- ##################
126
- ## helpers
127
- def strip_geo( name )
128
- ## e.g. Arsenal, London => Arsenal
129
- name.split(',')[0].strip
130
- end
131
- end # class ClubHistoryIndex
132
-
133
- end # module Import
134
- end # module SportDb
1
+ # encoding: utf-8
2
+
3
+ module SportDb
4
+ module Import
5
+
6
+
7
class ClubHistoryIndex

  ## Build an index from all club history entries found in the package at path.
  ##
  ## Every entry is read and parsed with ClubHistoryReader.parse; the
  ## resulting records are added to a new index, which is returned.
  def self.build( path )
    pack = Package.new( path )   ## lets us use directory or zip archive

    recs = []
    pack.each_clubs_history do |entry|
      ## note: append in place - no need to build a new array for every entry
      recs.concat( ClubHistoryReader.parse( entry.read ) )
    end

    index = new
    index.add( recs )
    index
  end


  ## shortcut to the (shared) import catalog - used for looking up club records
  def catalog() Import.catalog; end

  ## note: keep name history for now separate
  ##        from club struct - why? why not?
  ##       later yes, yes, yes, merge name history into club struct!!!!!
  ##
  ## for now the name history is experimental


  def initialize
    @clubs  = {}   ## history records (indexed) by canonical club name
    @errors = []
  end

  attr_reader :errors

  ## returns true if any errors got collected
  def errors?() !@errors.empty?; end

  ## returns the hash of history records by canonical club name
  def mappings() @clubs; end   ## todo/check: rename to records or histories or something - why? why not?


  ## Add one history entry for the club referenced by club_rec.
  ##
  ## The entry is stored as [season, [keyword, args]] under club_rec.name.
  ## Returns the club's (re-sorted) list of entries.
  def add_history( club_rec, keyword, season, args )
    ## note use season obj for now (and NOT key) - why? why not?
    history = @clubs[ club_rec.name ] ||= []

    history << [season, [keyword, args]]

    ## note: always keep records sorted by season (latest season first);
    ##        find_name_by relies on this order
    ##   check if 2010 and 2010/11 is in order using alpha sort?? (see argentina)
    history.sort! { |l,r| r[0] <=> l[0] }
  end


  ## Add one history record or an array of history records.
  ##
  ## A record is an array in the form
  ##    [keyword, season_key, [name_old, country_old], [name_new, country_new]]
  ##
  ## note: a single record is itself an array - to tell a single record
  ##        from a list of records check the first item
  ##        (a nested array for a list, the keyword for a single record).
  def add( rec_or_recs )
    is_list = rec_or_recs.is_a?( Array ) &&
              (rec_or_recs.empty? || rec_or_recs[0].is_a?( Array ))
    recs = is_list ? rec_or_recs : [rec_or_recs]   ## wrap (single) rec in array

    recs.each do |rec|
      keyword    = rec[0]
      season_key = rec[1]
      args       = rec[2..-1]   ## get rest of args e.g. one, two or more

      ## note: for now only add (re)name history season records,
      ##          that is, skip MERGE and BANKRUPT for now
      ##            and incl. only RENAME, REFORM, MOVE for now
      next if ['MERGE', 'BANKRUPT'].include?( keyword )

      name_old = strip_geo( args[0][0] )   ## note: strip optional geo part from name
      name_new = strip_geo( args[1][0] )

      country_old = args[0][1]
      country_new = args[1][1]

      club_old = catalog.clubs.find_by!( name: name_old, country: country_old )
      club_new = catalog.clubs.find_by!( name: name_new, country: country_new )

      ## note use season obj for now (and NOT key) - why? why not?
      season = Season.parse( season_key )

      ## todo/check:
      ##   check if club_old and club_new reference different club record!!
      ##    examples - RB II -> Liefering ??  or
      ##               FC Pasching -> OOE Juniors ??
      ##               Austria Salzburg -> RB Salzburg ??
      ##   for now always add name history to both - why? why not?

      add_history( club_old, keyword, season, args )
      ## note: allow for now different club references
      ##   but maybe warn later - why? why not?
      ##  add history to both for now
      add_history( club_new, keyword, season, args )  if club_old != club_new
    end
  end


  #### todo/check: move as method to club struct later - to always use club reference
  ##  returns (simply) name as string for now or nil - why? why not?
  #
  #  history entry example
  #  "Arsenal FC"=>
  #   [[1927/28, ["RENAME", [["The Arsenal FC, London", "eng"], ["Arsenal FC", "eng"]]]],
  #    [1914/15, ["RENAME", [["Woolwich Arsenal FC, London", "eng"], ["The Arsenal FC", "eng"]]]],
  #    [1892/93, ["RENAME", [["Royal Arsenal FC, London", "eng"], ["Woolwich Arsenal FC", "eng"]]]]],
  #
  ## Look up the club name in use in the season passed in.
  ##   name   - canonical club name (the key used in the index)
  ##   season - season obj or anything Season() converts
  ## Returns the name (geo part stripped) or nil if the club has no history.
  def find_name_by( name:, season: )
    recs = @clubs[ name ]
    return nil  if recs.nil?

    season = Season( season )   ## make sure season is a season obj (and NOT a string)

    ## check season records for name; use linear search (assume only few records)
    ##   note: records are sorted latest season first - the first hit wins
    recs.each do |rec|
      rec_season = rec[0]
      args       = rec[1][1]
      return strip_geo( args[1][0] )  if season >= rec_season   # use second arg (new name)
    end

    ## if we get here the season is before the first known change -
    ##   use the old name of the oldest record
    oldest_args = recs[-1][1][1]
    strip_geo( oldest_args[0][0] )   # use first arg (old name)
  end

  ##################
  ## helpers

  ## strip optional geo part from name
  ##   e.g. Arsenal, London  =>   Arsenal
  ## note: returns an empty string for an empty (or geo-only) name
  def strip_geo( name )
    name.split( ',', 2 ).first.to_s.strip
  end
end  # class ClubHistoryIndex
132
+
133
+ end # module Import
134
+ end # module SportDb
@@ -1,350 +1,350 @@
1
- # encoding: utf-8
2
-
3
-
4
- module SportDb
5
- module Import
6
-
7
-
8
- class ClubReader
9
-
10
- def catalog() Import.catalog; end
11
-
12
-
13
-
14
- def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
15
- txt = File.open( path, 'r:utf-8' ) { |f| f.read }
16
- parse( txt )
17
- end
18
-
19
- def self.parse( txt )
20
- new( txt ).parse
21
- end
22
-
23
- def initialize( txt )
24
- @txt = txt
25
- end
26
-
27
- ## pattern for b (child) team / club marker e.g.
28
- ## (ii) or ii) or ii.) or (ii.) or (II)
29
- ## (b) or b) or b.) or (b.) or (B)
30
- ## (2) or 2) or 2.) or (2.)
31
- B_TEAM_MARKER_RE = %r{^ \(? # optional opening bracket
32
- (?: ii|b|2 )
33
- \.? # optional dot - keep and allow dot - why? why not?
34
- \) # required closing bracket
35
- }xi ## note: add case-insenstive (e.g. II/ii or B/b)
36
-
37
- ## pattern for checking for address line e.g.
38
- ## use just one style / syntax - why? why not?
39
- ## Fischhofgasse 12 ~ 1100 Wien or
40
- ## Fischhofgasse 12 // 1100 Wien or Fischhofgasse 12 /// 1100 Wien
41
- ## Fischhofgasse 12 ++ 1100 Wien or Fischhofgasse 12 +++ 1100 Wien
42
- ADDR_MARKER_RE = %r{ (?: ^|[ ] ) # space or beginning of line
43
- (?: ~ | /{2,} | \+{2,} )
44
- (?: [ ]|$) # space or end of line
45
- }x
46
-
47
-
48
- def add_alt_names( rec, names ) ## helper for adding alternat names
49
-
50
- ## strip and squish (white)spaces
51
- # e.g. New York FC (2011-) => New York FC (2011-)
52
- names = names.map { |name| name.gsub( '$', '' ).strip
53
- .gsub( /[ \t]+/, ' ' ) }
54
- rec.alt_names += names
55
- rec.add_variants( names ) # auto-add (possible) auto-generated variant names
56
-
57
- ## check for duplicates
58
- if rec.duplicates?
59
- duplicates = rec.duplicates
60
- puts "*** !!! WARN !!! - #{duplicates.size} duplicate alt name mapping(s):"
61
- pp duplicates
62
- pp rec
63
- ##
64
- ## todo/fix: make it only an error with exit 1
65
- ## if (not normalized) names are the same (not unique/uniq)
66
- ## e.g. don't exit on A.F.C. == AFC etc.
67
- ## exit 1
68
- end
69
- end
70
-
71
-
72
- def parse
73
- recs = []
74
- last_rec = nil
75
- headings = [] ## headings stack
76
-
77
- OutlineReader.parse( @txt ).each do |node|
78
- if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
79
- heading_level = node[0][1].to_i
80
- heading = node[1]
81
-
82
- puts "heading #{heading_level} >#{heading}<"
83
-
84
- ## 1) first pop headings if present
85
- while headings.size+1 > heading_level
86
- headings.pop
87
- end
88
-
89
- ## 2) add missing (hierarchy) level if
90
- while headings.size+1 < heading_level
91
- ## todo/fix: issue warning about "skipping" hierarchy level
92
- puts "!!! warn [team reader] - skipping hierarchy level in headings "
93
- headings.push( nil )
94
- end
95
-
96
- if heading =~ /^\?+$/ ## note: use ? or ?? or ?? to reset level to nil
97
- ## keep level empty
98
- else
99
- ## note: if level is 1 assume country for now
100
- if heading_level == 1
101
- ## assume country in heading; allow all "formats" supported by parse e.g.
102
- ## Österreich • Austria (at)
103
- ## Österreich • Austria
104
- ## Austria
105
- ## Deutschland (de) • Germany
106
- country = catalog.countries.parse( heading )
107
- ## check country code - MUST exist for now!!!!
108
- if country.nil?
109
- puts "!!! error [club reader] - unknown country >#{heading}< - sorry - add country to config to fix"
110
- exit 1
111
- end
112
-
113
- headings.push( country.key )
114
- else
115
- ## quick hack:
116
- ## remove known fill/dummy words incl:
117
- ## Provincia San Juan => San Juan (see argentina, for example)
118
- ##
119
- ## use geo tree long term with alternative names - why? why not?
120
- words = ['Provincia']
121
- words.each { |word| heading = heading.gsub( word, '' ) }
122
- heading = heading.strip
123
-
124
- headings.push( heading )
125
- end
126
-
127
- ## assert that hierarchy level is ok
128
- if headings.size != heading_level
129
- puts "!!! error - headings hierarchy/stack out of order - #{heading.size}<=>#{heading_level}"
130
- exit 1
131
- end
132
- end
133
-
134
- pp headings
135
-
136
- elsif node[0] == :p ## paragraph with (text) lines
137
- lines = node[1]
138
- lines.each do |line|
139
- if line.start_with?( '|' )
140
- ## assume continuation with line of alternative names
141
- ## note: skip leading pipe
142
- values = line[1..-1].split( '|' ) # team names - allow/use pipe(|)
143
-
144
- add_alt_names( last_rec, values ) ## note: use alt_names helper for (re)use
145
-
146
- ## check for b (child) team / club marker e.g.
147
- ## (ii) or ii) or ii.) or (ii.)
148
- ## (b) or b) or b.) or (b.)
149
- ## (2) or 2) or 2.) or (2.)
150
- elsif line =~ B_TEAM_MARKER_RE
151
- line = line.sub( B_TEAM_MARKER_RE, '' ).strip ## remove (leading) b team marker
152
-
153
- ## todo/fix: move into "regular" club branch - (re)use, that is, use the same code
154
- # for both a and b team / club
155
- rec = Club.new
156
- value = line ## note: assume / allow just canonical name for now
157
- ## strip and squish (white)spaces
158
- # e.g. New York FC (2011-) => New York FC (2011-)
159
- value = value.gsub( '$', '' ).strip
160
- .gsub( /[ \t]+/, ' ' )
161
-
162
- rec.name = value # canoncial name (global unique "beautiful/long" name)
163
- rec.add_variants( value ) # auto-add (possible) auto-generated variant names
164
-
165
- ### link a and b team / clubs
166
- ## assume last_rec is the a team
167
- ## todo/fix: check last_rec required NOT null
168
- rec.a = last_rec
169
- last_rec.b = rec
170
-
171
- last_rec = rec
172
- recs << rec
173
-
174
- ## check for address line e.g.
175
- ## use just one style / syntax - why? why not?
176
- ## Fischhofgasse 12 ~ 1100 Wien or
177
- ## Fischhofgasse 12 // 1100 Wien or Fischhofgasse 12 /// 1100 Wien
178
- ## Fischhofgasse 12 ++ 1100 Wien or Fischhofgasse 12 +++ 1100 Wien
179
- elsif line =~ ADDR_MARKER_RE
180
- # note skip for now!!!
181
- # todo/fix: add support for address line!!!
182
- puts " skipping address line for now >#{line}<"
183
- else
184
- values = line.split( ',' )
185
-
186
- rec = Club.new
187
-
188
- col = values.shift ## get first item
189
- ## note: allow optional alt names for convenience with required canoncial name
190
- names = col.split( '|' ) # team names - allow/use pipe(|)
191
- value = names[0] ## canonical name
192
- alt_names = names[1..-1] ## optional (inline) alt names
193
-
194
- ## strip and squish (white)spaces
195
- # e.g. New York FC (2011-) => New York FC (2011-)
196
- value = value.gsub( '$', '' ).strip
197
- .gsub( /[ \t]+/, ' ' )
198
- rec.name = value # canoncial name (global unique "beautiful/long" name)
199
- rec.add_variants( value ) # auto-add (possible) auto-generated variant names
200
-
201
- ## note: add optional (inline) alternate names if present
202
- add_alt_names( rec, alt_names ) if alt_names.size > 0
203
-
204
- ## note:
205
- ## check/todo!!!!!!!!!!!!!!!!!-
206
- ## strip year if to present e.g. (2011-)
207
- ##
208
- ## do NOT strip for defunct / historic clubs e.g.
209
- ## (1899-1910)
210
- ## or (-1914) or (-2011) etc.
211
-
212
- ###
213
- ## todo: move year out of canonical team name - why? why not?
214
-
215
- ## check if canonical name include (2011-) or similar in name
216
- ## if yes, remove (2011-) and add to (alt) names
217
- ## e.g. New York FC (2011) => New York FC
218
- if rec.name =~ /\(.+?\)/ ## note: use non-greedy (?) match
219
- name = rec.name.gsub( /\(.+?\)/, '' ).strip
220
-
221
- if rec.name =~ /\(([0-9]{4})-\)/ ## e.g. (2014-)
222
- rec.year = $1.to_i
223
- elsif rec.name =~ /\(-([0-9]{4})\)/ ## e.g. (-2014)
224
- rec.year_end = $1.to_i
225
- elsif rec.name =~ /\(([0-9]{4})-([0-9]{4})\)/ ## e.g. (2011-2014)
226
- rec.year = $1.to_i
227
- rec.year_end = $2.to_i
228
- else
229
- ## todo/check: warn about unknown year format
230
- end
231
- end
232
-
233
- ## todo/check - check for unknown format values
234
- ## e.g. too many values, duplicate years, etc.
235
- ## check for overwritting, etc.
236
- while values.size > 0
237
- value = values.shift
238
- ## strip and squish (white)spaces
239
- # e.g. León › Guanajuato => León › Guanajuato
240
- value = value.strip.gsub( /[ \t]+/, ' ' )
241
- if value =~/^\d{4}$/ # e.g 1904
242
- ## todo/check: issue warning if year is already set!!!!!!!
243
- if rec.year
244
- puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{value}:"
245
- pp rec
246
- exit 1
247
- end
248
- rec.year = value.to_i
249
- elsif value.start_with?( '@' ) # e.g. @ Anfield
250
- ## cut-off leading @ and spaces
251
- rec.ground = value[1..-1].strip
252
- else
253
- ## assume city / geo tree
254
- ## split into geo tree
255
- geos = split_geo( value )
256
- city = geos[0]
257
- ## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
258
- if city =~ /\((.+?)\)/ ## note: use non-greedy (?) match
259
- rec.district = $1.strip
260
- city = city.gsub( /\(.+?\)/, '' ).strip
261
- end
262
- rec.city = city
263
-
264
- if geos.size > 1
265
- ## cut-off city and keep the rest (of geo tree)
266
- rec.geos = geos[1..-1]
267
- end
268
- end
269
- end ## while values
270
-
271
-
272
- ###############
273
- ## use headings text for geo tree
274
-
275
- ## 1) add country if present
276
- if headings.size > 0 && headings[0]
277
- country = catalog.countries.find( headings[0] )
278
- rec.country = country
279
- else
280
- ## make it an error - why? why not?
281
- puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
282
- exit 1
283
- end
284
-
285
- ## 2) check geo tree with headings hierarchy
286
- if headings.size > 1 && headings[1]
287
- geos = split_geo( headings[1] )
288
- if rec.geos
289
- if rec.geos[0] != geos[0]
290
- puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
291
- exit 1
292
- end
293
- if rec.geos[1] && rec.geos[1] != geos[1] ## check optional 2nd level too
294
- puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
295
- exit 1
296
- end
297
- else
298
- ## add missing region (state/province) from headings hierarchy
299
- rec.geos = geos
300
- end
301
- end
302
-
303
- last_rec = rec
304
-
305
-
306
- ### todo/fix:
307
- ## auto-add alt name with dots stripped - why? why not?
308
- ## e.g. D.C. United => DC United
309
- ## e.g. Liverpool F.C. => Liverpool FC
310
- ## e.g. St. Albin => St Albin etc.
311
- ## e.g. 1. FC Köln => 1 FC Köln -- make special case for 1. - why? why not?
312
-
313
- ##
314
- ## todo/fix: unify mapping entries
315
- ## always lowercase !!!! (case insensitive)
316
- ## always strip (2011-) !!!
317
- ## always strip dots (e.g. St., F.C, etc.)
318
-
319
- recs << rec
320
- end
321
- end # each line (in paragraph)
322
- else
323
- puts "** !!! ERROR !!! [club reader] - unknown line type:"
324
- pp node
325
- exit 1
326
- end
327
- end
328
-
329
- recs
330
- end # method read
331
-
332
- #######################
333
- ### helpers
334
- def split_geo( str )
335
- ## assume city / geo tree
336
- ## strip and squish (white)spaces
337
- # e.g. León › Guanajuato => León › Guanajuato
338
- str = str.strip.gsub( /[ \t]+/, ' ' )
339
-
340
- ## split into geo tree
341
- geos = str.split( /[<>‹›]/ ) ## note: allow > < or › ‹
342
- geos = geos.map { |geo| geo.strip } ## remove all whitespaces
343
- geos
344
- end
345
-
346
- end # class ClubReader
347
-
348
-
349
- end ## module Import
350
- end ## module SportDb
1
+ # encoding: utf-8
2
+
3
+
4
+ module SportDb
5
+ module Import
6
+
7
+
8
class ClubReader

  ## shortcut to the (shared) import catalog - used for country lookups
  def catalog() Import.catalog; end


  ## read the file at path (as utf-8 text) and parse it into club records
  def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
    txt = File.open( path, 'r:utf-8' ) { |f| f.read }
    parse( txt )
  end

  ## parse the text passed in into an array of club records
  def self.parse( txt )
    new( txt ).parse
  end

  def initialize( txt )
    @txt = txt
  end

  ##  pattern for b (child) team / club marker e.g.
  ##    (ii) or ii) or ii.) or (ii.) or (II)
  ##    (b)  or b)  or b.)  or (b.)  or (B)
  ##    (2)  or 2)  or 2.)  or (2.)
  B_TEAM_MARKER_RE = %r{^  \(?      # optional opening bracket
                             (?: ii|b|2 )
                            \.?     # optional dot - keep and allow dot - why? why not?
                            \)      # required closing bracket
                       }xi   ## note: add case-insensitive (e.g. II/ii or B/b)

  ## pattern for checking for address line e.g.
  ##    use just one style / syntax - why? why not?
  ##   Fischhofgasse 12 ~ 1100 Wien or
  ##   Fischhofgasse 12 // 1100 Wien or Fischhofgasse 12 /// 1100 Wien
  ##   Fischhofgasse 12 ++ 1100 Wien or Fischhofgasse 12 +++ 1100 Wien
  ADDR_MARKER_RE = %r{ (?: ^|[ ] )                # space or beginning of line
                         (?: ~ | /{2,} | \+{2,} )
                       (?: [ ]|$)                 # space or end of line
                     }x


  ## helper for adding alternate names to the club record passed in;
  ##   prints a warning if the record reports duplicate name mappings
  def add_alt_names( rec, names )
    ## strip and squish (white)spaces
    #   e.g. New York FC      (2011-)  => New York FC (2011-)
    names = names.map { |name| squish_name( name ) }
    rec.alt_names += names
    rec.add_variants( names )   # auto-add (possible) auto-generated variant names

    ## check for duplicates
    if rec.duplicates?
      duplicates = rec.duplicates
      puts "*** !!! WARN !!! - #{duplicates.size} duplicate alt name mapping(s):"
      pp duplicates
      pp rec
      ##
      ##  todo/fix: make it only an error with exit 1
      ##             if (not normalized) names are the same (not unique/uniq)
      ##               e.g. don't exit on  A.F.C. == AFC etc.
      ## exit 1
    end
  end


  ## Parse the text into club records.
  ##
  ## Walks the outline nodes returned by OutlineReader.parse:
  ##   - headings (h1..h6) update the headings stack (level 1 is the country)
  ##   - paragraph lines are alt names (leading pipe), b teams (leading marker),
  ##       address lines (skipped for now) or "regular" club lines
  ## Any other node type is an error (exit 1).
  ## Returns the array of club records.
  def parse
    recs     = []
    last_rec = nil
    headings = []   ## headings stack

    OutlineReader.parse( @txt ).each do |node|
      if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
        parse_heading( node, headings )
      elsif node[0] == :p   ## paragraph with (text) lines
        node[1].each do |line|
          if line.start_with?( '|' )
            ## assume continuation with line of alternative names
            require_last_rec( last_rec, line )
            ##  note: skip leading pipe
            values = line[1..-1].split( '|' )   # team names - allow/use pipe(|)
            add_alt_names( last_rec, values )   ## note: use alt_names helper for (re)use
          elsif line =~ B_TEAM_MARKER_RE
            ## note: the b team gets linked to the last club (assumed to be the a team)
            require_last_rec( last_rec, line )
            rec = parse_b_club( line, last_rec )
            last_rec = rec
            recs << rec
          elsif line =~ ADDR_MARKER_RE
            # note skip for now!!!
            # todo/fix: add support for address line!!!
            puts " skipping address line for now >#{line}<"
          else
            rec = parse_club( line, headings )
            last_rec = rec
            recs << rec
          end
        end  # each line (in paragraph)
      else
        puts "** !!! ERROR !!! [club reader] - unknown line type:"
        pp node
        exit 1
      end
    end

    recs
  end  # method parse

  #######################
  ### helpers

  ## split a geo tree string into its (stripped) parts
  ##   e.g. León  ›  Guanajuato  =>  ["León", "Guanajuato"]
  def split_geo( str )
    ## strip and squish (white)spaces
    str = str.strip.gsub( /[ \t]+/, ' ' )

    ## split into geo tree
    ##   note: allow > < or › ‹
    str.split( /[<>‹›]/ ).map { |geo| geo.strip }
  end


  private

  ## remove $ markers, strip and squish (white)spaces
  #   e.g. New York FC      (2011-)  => New York FC (2011-)
  def squish_name( str )
    str.gsub( '$', '' ).strip.gsub( /[ \t]+/, ' ' )
  end

  ## make sure a club record precedes an alt names or b team line -
  ##   otherwise there is nothing to attach the line to
  def require_last_rec( last_rec, line )
    return  if last_rec

    puts "!!! error [club reader] - no club found before line >#{line}< - sorry - add club line first to fix"
    exit 1
  end

  ## update the headings stack (in place) for the heading node passed in
  def parse_heading( node, headings )
    heading_level = node[0][1].to_i   ## e.g. :h2 => 2
    heading       = node[1]

    puts "heading #{heading_level} >#{heading}<"

    ## 1) first pop headings if present
    headings.pop  while headings.size+1 > heading_level

    ## 2) add missing (hierarchy) level if
    while headings.size+1 < heading_level
      ## todo/fix: issue warning about "skipping" hierarchy level
      puts "!!! warn [team reader] - skipping hierarchy level in headings "
      headings.push( nil )
    end

    ## note: use ? or ?? or ??? to reset level to nil, that is, keep level empty
    unless heading =~ /^\?+$/
      ## note: if level is 1 assume country for now
      if heading_level == 1
        ## assume country in heading; allow all "formats" supported by parse e.g.
        ##   Österreich • Austria (at)
        ##   Österreich • Austria
        ##   Austria
        ##   Deutschland (de) • Germany
        country = catalog.countries.parse( heading )
        ## check country code - MUST exist for now!!!!
        if country.nil?
          puts "!!! error [club reader] - unknown country >#{heading}< - sorry - add country to config to fix"
          exit 1
        end

        headings.push( country.key )
      else
        ## quick hack:
        ##   remove known fill/dummy words incl:
        ##     Provincia San Juan  =>  San Juan   (see argentina, for example)
        ##
        ##   use geo tree long term with alternative names - why? why not?
        words = ['Provincia']
        words.each { |word| heading = heading.gsub( word, '' ) }
        heading = heading.strip

        headings.push( heading )
      end

      ## assert that hierarchy level is ok
      ##   note: report the size of the headings stack (NOT the heading text length)
      if headings.size != heading_level
        puts "!!! error - headings hierarchy/stack out of order - #{headings.size}<=>#{heading_level}"
        exit 1
      end
    end

    pp headings
  end

  ## build the b (child) team / club record for the line passed in
  ##  and link it with the a team
  def parse_b_club( line, a_rec )
    ## todo/fix: move into "regular" club branch - (re)use, that is, use the same code
    #    for both a and b team / club
    line = line.sub( B_TEAM_MARKER_RE, '' ).strip   ## remove (leading) b team marker

    rec   = Club.new
    value = squish_name( line )   ## note: assume / allow just canonical name for now

    rec.name = value            # canonical name (global unique "beautiful/long" name)
    rec.add_variants( value )   # auto-add (possible) auto-generated variant names

    ### link a and b team / clubs
    rec.a   = a_rec
    a_rec.b = rec

    rec
  end

  ## build the club record for a "regular" club line, that is,
  ##   comma-separated values with the name(s) first, followed by
  ##   optional year, ground (@) and city / geo tree values
  def parse_club( line, headings )
    values = line.split( ',' )

    rec = Club.new

    col = values.shift   ## get first item
    ## note: allow optional alt names for convenience with required canonical name
    names     = col.split( '|' )   # team names - allow/use pipe(|)
    value     = squish_name( names[0] )   ## canonical name
    alt_names = names[1..-1]              ## optional (inline) alt names

    rec.name = value            # canonical name (global unique "beautiful/long" name)
    rec.add_variants( value )   # auto-add (possible) auto-generated variant names

    ## note: add optional (inline) alternate names if present
    add_alt_names( rec, alt_names )  if alt_names.size > 0

    ## note:
    ## check/todo!!!!!!!!!!!!!!!!!-
    ##   strip year if to present e.g. (2011-)
    ##
    ##   do NOT strip for defunct / historic clubs e.g.
    ##    (1899-1910)
    ##   or   (-1914) or (-2011) etc.
    ##
    ##  todo: move year out of canonical team name - why? why not?
    ##    note: for now the year stays in the name; only year / year_end get set
    if rec.name =~ /\(([0-9]{4})-\)/                   ## e.g. (2014-)
      rec.year     = $1.to_i
    elsif rec.name =~ /\(-([0-9]{4})\)/                ## e.g. (-2014)
      rec.year_end = $1.to_i
    elsif rec.name =~ /\(([0-9]{4})-([0-9]{4})\)/      ## e.g. (2011-2014)
      rec.year     = $1.to_i
      rec.year_end = $2.to_i
    else
      ## todo/check: warn about unknown year format
    end

    ## todo/check - check for unknown format values
    ##   e.g. too many values, duplicate years, etc.
    ##           check for overwriting, etc.
    values.each do |val|
      ## strip and squish (white)spaces
      #   e.g. León     › Guanajuato  => León › Guanajuato
      val = val.strip.gsub( /[ \t]+/, ' ' )
      if val =~ /^\d{4}$/   # e.g 1904
        if rec.year
          puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{val}:"
          pp rec
          exit 1
        end
        rec.year = val.to_i
      elsif val.start_with?( '@' )   # e.g. @ Anfield
        ## cut-off leading @ and spaces
        rec.ground = val[1..-1].strip
      else
        ## assume city / geo tree
        geos = split_geo( val )
        city = geos[0]
        ## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
        if city =~ /\((.+?)\)/   ## note: use non-greedy (?) match
          rec.district = $1.strip
          city = city.gsub( /\(.+?\)/, '' ).strip
        end
        rec.city = city

        ## cut-off city and keep the rest (of geo tree)
        rec.geos = geos[1..-1]  if geos.size > 1
      end
    end

    ###############
    ## use headings text for geo tree

    ## 1) add country if present
    if headings.size > 0 && headings[0]
      rec.country = catalog.countries.find( headings[0] )
    else
      ## make it an error - why? why not?
      puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
      exit 1
    end

    ## 2) check geo tree with headings hierarchy
    if headings.size > 1 && headings[1]
      geos = split_geo( headings[1] )
      if rec.geos
        if rec.geos[0] != geos[0]
          puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
          exit 1
        end
        if rec.geos[1] && rec.geos[1] != geos[1]   ## check optional 2nd level too
          puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
          exit 1
        end
      else
        ## add missing region (state/province) from headings hierarchy
        rec.geos = geos
      end
    end

    ### todo/fix:
    ##  auto-add alt name with dots stripped - why? why not?
    ##   e.g. D.C. United => DC United
    ##   e.g. Liverpool F.C. => Liverpool FC
    ##   e.g. St. Albin => St Albin etc.
    ##   e.g. 1. FC Köln => 1 FC Köln  -- make special case for 1. - why? why not?

    ##
    ## todo/fix: unify mapping entries
    ##   always lowercase !!!!  (case insensitive)
    ##   always strip (2011-) !!!
    ##   always strip dots (e.g. St., F.C, etc.)

    rec
  end

end  # class ClubReader
347
+
348
+
349
+ end ## module Import
350
+ end ## module SportDb