sportdb-formats 1.1.5 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,134 +1,134 @@
1
- # encoding: utf-8
2
-
3
- module SportDb
4
- module Import
5
-
6
-
7
- class ClubHistoryIndex
8
-
9
- def self.build( path )
10
- pack = Package.new( path ) ## lets us use direcotry or zip archive
11
-
12
- recs = []
13
- pack.each_clubs_history do |entry|
14
- recs += ClubHistoryReader.parse( entry.read )
15
- end
16
- recs
17
-
18
- index = new
19
- index.add( recs )
20
- index
21
- end
22
-
23
-
24
-
25
- def catalog() Import.catalog; end
26
-
27
- ## note: keep name history for now separate from
28
- ## from club struct - why? why not?
29
- ## later yes, yes, yes, merge name history into club struct!!!!!
30
- ##
31
- ## for now the name history is experimental
32
-
33
-
34
- def initialize
35
- @clubs = {} ## clubs (indexed) by canonical name
36
- @errors = []
37
- end
38
-
39
- attr_reader :errors
40
- def errors?() @errors.empty? == false; end
41
-
42
- def mappings() @clubs; end ## todo/check: rename to records or histories or something - why? why not?
43
-
44
-
45
- def add_history( club_rec, keyword, season, args )
46
- ## note use season obj for now (and NOT key) - why? why not?
47
- rec = @clubs[ club_rec.name ] ||= []
48
-
49
- rec << [season, [keyword, args]]
50
-
51
- ## note: always keep records sorted by season_key for now
52
- ## check if 2010 and 2010/11 is in order using alpha sort?? (see argentina)
53
- rec.sort! { |l,r| r[0] <=> l[0] }
54
- end
55
-
56
-
57
- def add( rec_or_recs ) ## add club record / alt_names
58
- recs = rec_or_recs.is_a?( Array ) ? rec_or_recs : [rec_or_recs] ## wrap (single) rec in array
59
-
60
- recs.each do |rec|
61
-
62
- keyword = rec[0]
63
- season_key = rec[1]
64
- args = rec[2..-1] ## get rest of args e.g. one, two or more
65
-
66
- ## note: for now only add (re)name history season records,
67
- ## that is, skip MERGE and BANKRUPT for now
68
- ## and incl. only RENAME, REFORM, MOVE for now
69
- next if ['MERGE', 'BANKRUPT'].include?( keyword )
70
-
71
-
72
- name_old = strip_geo( args[0][0] ) ## note: strip optional geo part from name
73
- name_new = strip_geo( args[1][0] )
74
-
75
- country_old = args[0][1]
76
- country_new = args[1][1]
77
-
78
- club_old = catalog.clubs.find_by!( name: name_old, country: country_old )
79
- club_new = catalog.clubs.find_by!( name: name_new, country: country_new )
80
-
81
- ## note use season obj for now (and NOT key) - why? why not?
82
- season = Season.parse( season_key )
83
-
84
- ## todo/check:
85
- ## check if club_old and club_new reference different club record!!
86
- ## examples - RB II -> Liefering ?? or
87
- ## FC Pasching -> OOE Juniors ??
88
- ## Austria Salzburg -> RB Salburg ??
89
- ## for now always add name history to both - why? why not?
90
-
91
- add_history( club_old, keyword, season, args )
92
- ## note: allow for now different club references
93
- ## but maybe warn later - why? why not?
94
- ## add history to both for now
95
- add_history( club_new, keyword, season, args ) if club_old != club_new
96
- end # each rec
97
- end # method add
98
-
99
-
100
- #### todo/check: move as method to club struct later - to always use club reference
101
- ## returns (simply) name as string for now or nil - why? why not?
102
- #
103
- # history entry example
104
- # Arsenal FC"=>
105
- # [[1927/28, ["RENAME", [["The Arsenal FC, London", "eng"], ["Arsenal FC", "eng"]]]],
106
- # [1914/15, ["RENAME", [["Woolwich Arsenal FC, London", "eng"], ["The Arsenal FC", "eng"]]]],
107
- # [1892/93, ["RENAME", [["Royal Arsenal FC, London", "eng"], ["Woolwich Arsenal FC", "eng"]]]]],
108
- def find_name_by( name:, season: )
109
- recs = @clubs[ name ]
110
- if recs
111
- season = Season( season ) ## make sure season is a season obj (and NOT a string)
112
- ## check season records for name; use linear search (assume only few records)
113
- recs.each do |rec|
114
- if season >= rec[0]
115
- return strip_geo( rec[1][1][1][0] ) # use second arg
116
- end
117
- end
118
- ## if we get here use last name
119
- strip_geo( recs[-1][1][1][0][0] ) # use first arg
120
- else
121
- nil
122
- end
123
- end
124
-
125
- ##################
126
- ## helpers
127
- def strip_geo( name )
128
- ## e.g. Arsenal, London => Arsenal
129
- name.split(',')[0].strip
130
- end
131
- end # class ClubHistoryIndex
132
-
133
- end # module Import
134
- end # module SportDb
1
+ # encoding: utf-8
2
+
3
+ module SportDb
4
+ module Import
5
+
6
+
7
+ class ClubHistoryIndex
8
+
9
+ def self.build( path )
10
+ pack = Package.new( path ) ## lets us use direcotry or zip archive
11
+
12
+ recs = []
13
+ pack.each_clubs_history do |entry|
14
+ recs += ClubHistoryReader.parse( entry.read )
15
+ end
16
+ recs
17
+
18
+ index = new
19
+ index.add( recs )
20
+ index
21
+ end
22
+
23
+
24
+
25
+ def catalog() Import.catalog; end
26
+
27
+ ## note: keep name history for now separate from
28
+ ## from club struct - why? why not?
29
+ ## later yes, yes, yes, merge name history into club struct!!!!!
30
+ ##
31
+ ## for now the name history is experimental
32
+
33
+
34
+ def initialize
35
+ @clubs = {} ## clubs (indexed) by canonical name
36
+ @errors = []
37
+ end
38
+
39
+ attr_reader :errors
40
+ def errors?() @errors.empty? == false; end
41
+
42
+ def mappings() @clubs; end ## todo/check: rename to records or histories or something - why? why not?
43
+
44
+
45
+ def add_history( club_rec, keyword, season, args )
46
+ ## note use season obj for now (and NOT key) - why? why not?
47
+ rec = @clubs[ club_rec.name ] ||= []
48
+
49
+ rec << [season, [keyword, args]]
50
+
51
+ ## note: always keep records sorted by season_key for now
52
+ ## check if 2010 and 2010/11 is in order using alpha sort?? (see argentina)
53
+ rec.sort! { |l,r| r[0] <=> l[0] }
54
+ end
55
+
56
+
57
+ def add( rec_or_recs ) ## add club record / alt_names
58
+ recs = rec_or_recs.is_a?( Array ) ? rec_or_recs : [rec_or_recs] ## wrap (single) rec in array
59
+
60
+ recs.each do |rec|
61
+
62
+ keyword = rec[0]
63
+ season_key = rec[1]
64
+ args = rec[2..-1] ## get rest of args e.g. one, two or more
65
+
66
+ ## note: for now only add (re)name history season records,
67
+ ## that is, skip MERGE and BANKRUPT for now
68
+ ## and incl. only RENAME, REFORM, MOVE for now
69
+ next if ['MERGE', 'BANKRUPT'].include?( keyword )
70
+
71
+
72
+ name_old = strip_geo( args[0][0] ) ## note: strip optional geo part from name
73
+ name_new = strip_geo( args[1][0] )
74
+
75
+ country_old = args[0][1]
76
+ country_new = args[1][1]
77
+
78
+ club_old = catalog.clubs.find_by!( name: name_old, country: country_old )
79
+ club_new = catalog.clubs.find_by!( name: name_new, country: country_new )
80
+
81
+ ## note use season obj for now (and NOT key) - why? why not?
82
+ season = Season.parse( season_key )
83
+
84
+ ## todo/check:
85
+ ## check if club_old and club_new reference different club record!!
86
+ ## examples - RB II -> Liefering ?? or
87
+ ## FC Pasching -> OOE Juniors ??
88
+ ## Austria Salzburg -> RB Salburg ??
89
+ ## for now always add name history to both - why? why not?
90
+
91
+ add_history( club_old, keyword, season, args )
92
+ ## note: allow for now different club references
93
+ ## but maybe warn later - why? why not?
94
+ ## add history to both for now
95
+ add_history( club_new, keyword, season, args ) if club_old != club_new
96
+ end # each rec
97
+ end # method add
98
+
99
+
100
+ #### todo/check: move as method to club struct later - to always use club reference
101
+ ## returns (simply) name as string for now or nil - why? why not?
102
+ #
103
+ # history entry example
104
+ # Arsenal FC"=>
105
+ # [[1927/28, ["RENAME", [["The Arsenal FC, London", "eng"], ["Arsenal FC", "eng"]]]],
106
+ # [1914/15, ["RENAME", [["Woolwich Arsenal FC, London", "eng"], ["The Arsenal FC", "eng"]]]],
107
+ # [1892/93, ["RENAME", [["Royal Arsenal FC, London", "eng"], ["Woolwich Arsenal FC", "eng"]]]]],
108
+ def find_name_by( name:, season: )
109
+ recs = @clubs[ name ]
110
+ if recs
111
+ season = Season( season ) ## make sure season is a season obj (and NOT a string)
112
+ ## check season records for name; use linear search (assume only few records)
113
+ recs.each do |rec|
114
+ if season >= rec[0]
115
+ return strip_geo( rec[1][1][1][0] ) # use second arg
116
+ end
117
+ end
118
+ ## if we get here use last name
119
+ strip_geo( recs[-1][1][1][0][0] ) # use first arg
120
+ else
121
+ nil
122
+ end
123
+ end
124
+
125
+ ##################
126
+ ## helpers
127
+ def strip_geo( name )
128
+ ## e.g. Arsenal, London => Arsenal
129
+ name.split(',')[0].strip
130
+ end
131
+ end # class ClubHistoryIndex
132
+
133
+ end # module Import
134
+ end # module SportDb
@@ -1,350 +1,350 @@
1
- # encoding: utf-8
2
-
3
-
4
- module SportDb
5
- module Import
6
-
7
-
8
- class ClubReader
9
-
10
- def catalog() Import.catalog; end
11
-
12
-
13
-
14
- def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
15
- txt = File.open( path, 'r:utf-8' ) { |f| f.read }
16
- parse( txt )
17
- end
18
-
19
- def self.parse( txt )
20
- new( txt ).parse
21
- end
22
-
23
- def initialize( txt )
24
- @txt = txt
25
- end
26
-
27
- ## pattern for b (child) team / club marker e.g.
28
- ## (ii) or ii) or ii.) or (ii.) or (II)
29
- ## (b) or b) or b.) or (b.) or (B)
30
- ## (2) or 2) or 2.) or (2.)
31
- B_TEAM_MARKER_RE = %r{^ \(? # optional opening bracket
32
- (?: ii|b|2 )
33
- \.? # optional dot - keep and allow dot - why? why not?
34
- \) # required closing bracket
35
- }xi ## note: add case-insenstive (e.g. II/ii or B/b)
36
-
37
- ## pattern for checking for address line e.g.
38
- ## use just one style / syntax - why? why not?
39
- ## Fischhofgasse 12 ~ 1100 Wien or
40
- ## Fischhofgasse 12 // 1100 Wien or Fischhofgasse 12 /// 1100 Wien
41
- ## Fischhofgasse 12 ++ 1100 Wien or Fischhofgasse 12 +++ 1100 Wien
42
- ADDR_MARKER_RE = %r{ (?: ^|[ ] ) # space or beginning of line
43
- (?: ~ | /{2,} | \+{2,} )
44
- (?: [ ]|$) # space or end of line
45
- }x
46
-
47
-
48
- def add_alt_names( rec, names ) ## helper for adding alternat names
49
-
50
- ## strip and squish (white)spaces
51
- # e.g. New York FC (2011-) => New York FC (2011-)
52
- names = names.map { |name| name.gsub( '$', '' ).strip
53
- .gsub( /[ \t]+/, ' ' ) }
54
- rec.alt_names += names
55
- rec.add_variants( names ) # auto-add (possible) auto-generated variant names
56
-
57
- ## check for duplicates
58
- if rec.duplicates?
59
- duplicates = rec.duplicates
60
- puts "*** !!! WARN !!! - #{duplicates.size} duplicate alt name mapping(s):"
61
- pp duplicates
62
- pp rec
63
- ##
64
- ## todo/fix: make it only an error with exit 1
65
- ## if (not normalized) names are the same (not unique/uniq)
66
- ## e.g. don't exit on A.F.C. == AFC etc.
67
- ## exit 1
68
- end
69
- end
70
-
71
-
72
- def parse
73
- recs = []
74
- last_rec = nil
75
- headings = [] ## headings stack
76
-
77
- OutlineReader.parse( @txt ).each do |node|
78
- if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
79
- heading_level = node[0][1].to_i
80
- heading = node[1]
81
-
82
- puts "heading #{heading_level} >#{heading}<"
83
-
84
- ## 1) first pop headings if present
85
- while headings.size+1 > heading_level
86
- headings.pop
87
- end
88
-
89
- ## 2) add missing (hierarchy) level if
90
- while headings.size+1 < heading_level
91
- ## todo/fix: issue warning about "skipping" hierarchy level
92
- puts "!!! warn [team reader] - skipping hierarchy level in headings "
93
- headings.push( nil )
94
- end
95
-
96
- if heading =~ /^\?+$/ ## note: use ? or ?? or ?? to reset level to nil
97
- ## keep level empty
98
- else
99
- ## note: if level is 1 assume country for now
100
- if heading_level == 1
101
- ## assume country in heading; allow all "formats" supported by parse e.g.
102
- ## Österreich • Austria (at)
103
- ## Österreich • Austria
104
- ## Austria
105
- ## Deutschland (de) • Germany
106
- country = catalog.countries.parse( heading )
107
- ## check country code - MUST exist for now!!!!
108
- if country.nil?
109
- puts "!!! error [club reader] - unknown country >#{heading}< - sorry - add country to config to fix"
110
- exit 1
111
- end
112
-
113
- headings.push( country.key )
114
- else
115
- ## quick hack:
116
- ## remove known fill/dummy words incl:
117
- ## Provincia San Juan => San Juan (see argentina, for example)
118
- ##
119
- ## use geo tree long term with alternative names - why? why not?
120
- words = ['Provincia']
121
- words.each { |word| heading = heading.gsub( word, '' ) }
122
- heading = heading.strip
123
-
124
- headings.push( heading )
125
- end
126
-
127
- ## assert that hierarchy level is ok
128
- if headings.size != heading_level
129
- puts "!!! error - headings hierarchy/stack out of order - #{heading.size}<=>#{heading_level}"
130
- exit 1
131
- end
132
- end
133
-
134
- pp headings
135
-
136
- elsif node[0] == :p ## paragraph with (text) lines
137
- lines = node[1]
138
- lines.each do |line|
139
- if line.start_with?( '|' )
140
- ## assume continuation with line of alternative names
141
- ## note: skip leading pipe
142
- values = line[1..-1].split( '|' ) # team names - allow/use pipe(|)
143
-
144
- add_alt_names( last_rec, values ) ## note: use alt_names helper for (re)use
145
-
146
- ## check for b (child) team / club marker e.g.
147
- ## (ii) or ii) or ii.) or (ii.)
148
- ## (b) or b) or b.) or (b.)
149
- ## (2) or 2) or 2.) or (2.)
150
- elsif line =~ B_TEAM_MARKER_RE
151
- line = line.sub( B_TEAM_MARKER_RE, '' ).strip ## remove (leading) b team marker
152
-
153
- ## todo/fix: move into "regular" club branch - (re)use, that is, use the same code
154
- # for both a and b team / club
155
- rec = Club.new
156
- value = line ## note: assume / allow just canonical name for now
157
- ## strip and squish (white)spaces
158
- # e.g. New York FC (2011-) => New York FC (2011-)
159
- value = value.gsub( '$', '' ).strip
160
- .gsub( /[ \t]+/, ' ' )
161
-
162
- rec.name = value # canoncial name (global unique "beautiful/long" name)
163
- rec.add_variants( value ) # auto-add (possible) auto-generated variant names
164
-
165
- ### link a and b team / clubs
166
- ## assume last_rec is the a team
167
- ## todo/fix: check last_rec required NOT null
168
- rec.a = last_rec
169
- last_rec.b = rec
170
-
171
- last_rec = rec
172
- recs << rec
173
-
174
- ## check for address line e.g.
175
- ## use just one style / syntax - why? why not?
176
- ## Fischhofgasse 12 ~ 1100 Wien or
177
- ## Fischhofgasse 12 // 1100 Wien or Fischhofgasse 12 /// 1100 Wien
178
- ## Fischhofgasse 12 ++ 1100 Wien or Fischhofgasse 12 +++ 1100 Wien
179
- elsif line =~ ADDR_MARKER_RE
180
- # note skip for now!!!
181
- # todo/fix: add support for address line!!!
182
- puts " skipping address line for now >#{line}<"
183
- else
184
- values = line.split( ',' )
185
-
186
- rec = Club.new
187
-
188
- col = values.shift ## get first item
189
- ## note: allow optional alt names for convenience with required canoncial name
190
- names = col.split( '|' ) # team names - allow/use pipe(|)
191
- value = names[0] ## canonical name
192
- alt_names = names[1..-1] ## optional (inline) alt names
193
-
194
- ## strip and squish (white)spaces
195
- # e.g. New York FC (2011-) => New York FC (2011-)
196
- value = value.gsub( '$', '' ).strip
197
- .gsub( /[ \t]+/, ' ' )
198
- rec.name = value # canoncial name (global unique "beautiful/long" name)
199
- rec.add_variants( value ) # auto-add (possible) auto-generated variant names
200
-
201
- ## note: add optional (inline) alternate names if present
202
- add_alt_names( rec, alt_names ) if alt_names.size > 0
203
-
204
- ## note:
205
- ## check/todo!!!!!!!!!!!!!!!!!-
206
- ## strip year if to present e.g. (2011-)
207
- ##
208
- ## do NOT strip for defunct / historic clubs e.g.
209
- ## (1899-1910)
210
- ## or (-1914) or (-2011) etc.
211
-
212
- ###
213
- ## todo: move year out of canonical team name - why? why not?
214
-
215
- ## check if canonical name include (2011-) or similar in name
216
- ## if yes, remove (2011-) and add to (alt) names
217
- ## e.g. New York FC (2011) => New York FC
218
- if rec.name =~ /\(.+?\)/ ## note: use non-greedy (?) match
219
- name = rec.name.gsub( /\(.+?\)/, '' ).strip
220
-
221
- if rec.name =~ /\(([0-9]{4})-\)/ ## e.g. (2014-)
222
- rec.year = $1.to_i
223
- elsif rec.name =~ /\(-([0-9]{4})\)/ ## e.g. (-2014)
224
- rec.year_end = $1.to_i
225
- elsif rec.name =~ /\(([0-9]{4})-([0-9]{4})\)/ ## e.g. (2011-2014)
226
- rec.year = $1.to_i
227
- rec.year_end = $2.to_i
228
- else
229
- ## todo/check: warn about unknown year format
230
- end
231
- end
232
-
233
- ## todo/check - check for unknown format values
234
- ## e.g. too many values, duplicate years, etc.
235
- ## check for overwritting, etc.
236
- while values.size > 0
237
- value = values.shift
238
- ## strip and squish (white)spaces
239
- # e.g. León › Guanajuato => León › Guanajuato
240
- value = value.strip.gsub( /[ \t]+/, ' ' )
241
- if value =~/^\d{4}$/ # e.g 1904
242
- ## todo/check: issue warning if year is already set!!!!!!!
243
- if rec.year
244
- puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{value}:"
245
- pp rec
246
- exit 1
247
- end
248
- rec.year = value.to_i
249
- elsif value.start_with?( '@' ) # e.g. @ Anfield
250
- ## cut-off leading @ and spaces
251
- rec.ground = value[1..-1].strip
252
- else
253
- ## assume city / geo tree
254
- ## split into geo tree
255
- geos = split_geo( value )
256
- city = geos[0]
257
- ## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
258
- if city =~ /\((.+?)\)/ ## note: use non-greedy (?) match
259
- rec.district = $1.strip
260
- city = city.gsub( /\(.+?\)/, '' ).strip
261
- end
262
- rec.city = city
263
-
264
- if geos.size > 1
265
- ## cut-off city and keep the rest (of geo tree)
266
- rec.geos = geos[1..-1]
267
- end
268
- end
269
- end ## while values
270
-
271
-
272
- ###############
273
- ## use headings text for geo tree
274
-
275
- ## 1) add country if present
276
- if headings.size > 0 && headings[0]
277
- country = catalog.countries.find( headings[0] )
278
- rec.country = country
279
- else
280
- ## make it an error - why? why not?
281
- puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
282
- exit 1
283
- end
284
-
285
- ## 2) check geo tree with headings hierarchy
286
- if headings.size > 1 && headings[1]
287
- geos = split_geo( headings[1] )
288
- if rec.geos
289
- if rec.geos[0] != geos[0]
290
- puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
291
- exit 1
292
- end
293
- if rec.geos[1] && rec.geos[1] != geos[1] ## check optional 2nd level too
294
- puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
295
- exit 1
296
- end
297
- else
298
- ## add missing region (state/province) from headings hierarchy
299
- rec.geos = geos
300
- end
301
- end
302
-
303
- last_rec = rec
304
-
305
-
306
- ### todo/fix:
307
- ## auto-add alt name with dots stripped - why? why not?
308
- ## e.g. D.C. United => DC United
309
- ## e.g. Liverpool F.C. => Liverpool FC
310
- ## e.g. St. Albin => St Albin etc.
311
- ## e.g. 1. FC Köln => 1 FC Köln -- make special case for 1. - why? why not?
312
-
313
- ##
314
- ## todo/fix: unify mapping entries
315
- ## always lowercase !!!! (case insensitive)
316
- ## always strip (2011-) !!!
317
- ## always strip dots (e.g. St., F.C, etc.)
318
-
319
- recs << rec
320
- end
321
- end # each line (in paragraph)
322
- else
323
- puts "** !!! ERROR !!! [club reader] - unknown line type:"
324
- pp node
325
- exit 1
326
- end
327
- end
328
-
329
- recs
330
- end # method read
331
-
332
- #######################
333
- ### helpers
334
- def split_geo( str )
335
- ## assume city / geo tree
336
- ## strip and squish (white)spaces
337
- # e.g. León › Guanajuato => León › Guanajuato
338
- str = str.strip.gsub( /[ \t]+/, ' ' )
339
-
340
- ## split into geo tree
341
- geos = str.split( /[<>‹›]/ ) ## note: allow > < or › ‹
342
- geos = geos.map { |geo| geo.strip } ## remove all whitespaces
343
- geos
344
- end
345
-
346
- end # class ClubReader
347
-
348
-
349
- end ## module Import
350
- end ## module SportDb
1
+ # encoding: utf-8
2
+
3
+
4
+ module SportDb
5
+ module Import
6
+
7
+
8
+ class ClubReader
9
+
10
+ def catalog() Import.catalog; end
11
+
12
+
13
+
14
+ def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
15
+ txt = File.open( path, 'r:utf-8' ) { |f| f.read }
16
+ parse( txt )
17
+ end
18
+
19
+ def self.parse( txt )
20
+ new( txt ).parse
21
+ end
22
+
23
+ def initialize( txt )
24
+ @txt = txt
25
+ end
26
+
27
+ ## pattern for b (child) team / club marker e.g.
28
+ ## (ii) or ii) or ii.) or (ii.) or (II)
29
+ ## (b) or b) or b.) or (b.) or (B)
30
+ ## (2) or 2) or 2.) or (2.)
31
+ B_TEAM_MARKER_RE = %r{^ \(? # optional opening bracket
32
+ (?: ii|b|2 )
33
+ \.? # optional dot - keep and allow dot - why? why not?
34
+ \) # required closing bracket
35
+ }xi ## note: add case-insenstive (e.g. II/ii or B/b)
36
+
37
+ ## pattern for checking for address line e.g.
38
+ ## use just one style / syntax - why? why not?
39
+ ## Fischhofgasse 12 ~ 1100 Wien or
40
+ ## Fischhofgasse 12 // 1100 Wien or Fischhofgasse 12 /// 1100 Wien
41
+ ## Fischhofgasse 12 ++ 1100 Wien or Fischhofgasse 12 +++ 1100 Wien
42
+ ADDR_MARKER_RE = %r{ (?: ^|[ ] ) # space or beginning of line
43
+ (?: ~ | /{2,} | \+{2,} )
44
+ (?: [ ]|$) # space or end of line
45
+ }x
46
+
47
+
48
+ def add_alt_names( rec, names ) ## helper for adding alternat names
49
+
50
+ ## strip and squish (white)spaces
51
+ # e.g. New York FC (2011-) => New York FC (2011-)
52
+ names = names.map { |name| name.gsub( '$', '' ).strip
53
+ .gsub( /[ \t]+/, ' ' ) }
54
+ rec.alt_names += names
55
+ rec.add_variants( names ) # auto-add (possible) auto-generated variant names
56
+
57
+ ## check for duplicates
58
+ if rec.duplicates?
59
+ duplicates = rec.duplicates
60
+ puts "*** !!! WARN !!! - #{duplicates.size} duplicate alt name mapping(s):"
61
+ pp duplicates
62
+ pp rec
63
+ ##
64
+ ## todo/fix: make it only an error with exit 1
65
+ ## if (not normalized) names are the same (not unique/uniq)
66
+ ## e.g. don't exit on A.F.C. == AFC etc.
67
+ ## exit 1
68
+ end
69
+ end
70
+
71
+
72
+ def parse
73
+ recs = []
74
+ last_rec = nil
75
+ headings = [] ## headings stack
76
+
77
+ OutlineReader.parse( @txt ).each do |node|
78
+ if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
79
+ heading_level = node[0][1].to_i
80
+ heading = node[1]
81
+
82
+ puts "heading #{heading_level} >#{heading}<"
83
+
84
+ ## 1) first pop headings if present
85
+ while headings.size+1 > heading_level
86
+ headings.pop
87
+ end
88
+
89
+ ## 2) add missing (hierarchy) level if
90
+ while headings.size+1 < heading_level
91
+ ## todo/fix: issue warning about "skipping" hierarchy level
92
+ puts "!!! warn [team reader] - skipping hierarchy level in headings "
93
+ headings.push( nil )
94
+ end
95
+
96
+ if heading =~ /^\?+$/ ## note: use ? or ?? or ?? to reset level to nil
97
+ ## keep level empty
98
+ else
99
+ ## note: if level is 1 assume country for now
100
+ if heading_level == 1
101
+ ## assume country in heading; allow all "formats" supported by parse e.g.
102
+ ## Österreich • Austria (at)
103
+ ## Österreich • Austria
104
+ ## Austria
105
+ ## Deutschland (de) • Germany
106
+ country = catalog.countries.parse( heading )
107
+ ## check country code - MUST exist for now!!!!
108
+ if country.nil?
109
+ puts "!!! error [club reader] - unknown country >#{heading}< - sorry - add country to config to fix"
110
+ exit 1
111
+ end
112
+
113
+ headings.push( country.key )
114
+ else
115
+ ## quick hack:
116
+ ## remove known fill/dummy words incl:
117
+ ## Provincia San Juan => San Juan (see argentina, for example)
118
+ ##
119
+ ## use geo tree long term with alternative names - why? why not?
120
+ words = ['Provincia']
121
+ words.each { |word| heading = heading.gsub( word, '' ) }
122
+ heading = heading.strip
123
+
124
+ headings.push( heading )
125
+ end
126
+
127
+ ## assert that hierarchy level is ok
128
+ if headings.size != heading_level
129
+ puts "!!! error - headings hierarchy/stack out of order - #{heading.size}<=>#{heading_level}"
130
+ exit 1
131
+ end
132
+ end
133
+
134
+ pp headings
135
+
136
+ elsif node[0] == :p ## paragraph with (text) lines
137
+ lines = node[1]
138
+ lines.each do |line|
139
+ if line.start_with?( '|' )
140
+ ## assume continuation with line of alternative names
141
+ ## note: skip leading pipe
142
+ values = line[1..-1].split( '|' ) # team names - allow/use pipe(|)
143
+
144
+ add_alt_names( last_rec, values ) ## note: use alt_names helper for (re)use
145
+
146
+ ## check for b (child) team / club marker e.g.
147
+ ## (ii) or ii) or ii.) or (ii.)
148
+ ## (b) or b) or b.) or (b.)
149
+ ## (2) or 2) or 2.) or (2.)
150
+ elsif line =~ B_TEAM_MARKER_RE
151
+ line = line.sub( B_TEAM_MARKER_RE, '' ).strip ## remove (leading) b team marker
152
+
153
+ ## todo/fix: move into "regular" club branch - (re)use, that is, use the same code
154
+ # for both a and b team / club
155
+ rec = Club.new
156
+ value = line ## note: assume / allow just canonical name for now
157
+ ## strip and squish (white)spaces
158
+ # e.g. New York FC (2011-) => New York FC (2011-)
159
+ value = value.gsub( '$', '' ).strip
160
+ .gsub( /[ \t]+/, ' ' )
161
+
162
+ rec.name = value # canoncial name (global unique "beautiful/long" name)
163
+ rec.add_variants( value ) # auto-add (possible) auto-generated variant names
164
+
165
+ ### link a and b team / clubs
166
+ ## assume last_rec is the a team
167
+ ## todo/fix: check last_rec required NOT null
168
+ rec.a = last_rec
169
+ last_rec.b = rec
170
+
171
+ last_rec = rec
172
+ recs << rec
173
+
174
+ ## check for address line e.g.
175
+ ## use just one style / syntax - why? why not?
176
+ ## Fischhofgasse 12 ~ 1100 Wien or
177
+ ## Fischhofgasse 12 // 1100 Wien or Fischhofgasse 12 /// 1100 Wien
178
+ ## Fischhofgasse 12 ++ 1100 Wien or Fischhofgasse 12 +++ 1100 Wien
179
+ elsif line =~ ADDR_MARKER_RE
180
+ # note skip for now!!!
181
+ # todo/fix: add support for address line!!!
182
+ puts " skipping address line for now >#{line}<"
183
+ else
184
+ values = line.split( ',' )
185
+
186
+ rec = Club.new
187
+
188
+ col = values.shift ## get first item
189
+ ## note: allow optional alt names for convenience with required canoncial name
190
+ names = col.split( '|' ) # team names - allow/use pipe(|)
191
+ value = names[0] ## canonical name
192
+ alt_names = names[1..-1] ## optional (inline) alt names
193
+
194
+ ## strip and squish (white)spaces
195
+ # e.g. New York FC (2011-) => New York FC (2011-)
196
+ value = value.gsub( '$', '' ).strip
197
+ .gsub( /[ \t]+/, ' ' )
198
+ rec.name = value # canoncial name (global unique "beautiful/long" name)
199
+ rec.add_variants( value ) # auto-add (possible) auto-generated variant names
200
+
201
+ ## note: add optional (inline) alternate names if present
202
+ add_alt_names( rec, alt_names ) if alt_names.size > 0
203
+
204
+ ## note:
205
+ ## check/todo!!!!!!!!!!!!!!!!!-
206
+ ## strip year if to present e.g. (2011-)
207
+ ##
208
+ ## do NOT strip for defunct / historic clubs e.g.
209
+ ## (1899-1910)
210
+ ## or (-1914) or (-2011) etc.
211
+
212
+ ###
213
+ ## todo: move year out of canonical team name - why? why not?
214
+
215
+ ## check if canonical name include (2011-) or similar in name
216
+ ## if yes, remove (2011-) and add to (alt) names
217
+ ## e.g. New York FC (2011) => New York FC
218
+ if rec.name =~ /\(.+?\)/ ## note: use non-greedy (?) match
219
+ name = rec.name.gsub( /\(.+?\)/, '' ).strip
220
+
221
+ if rec.name =~ /\(([0-9]{4})-\)/ ## e.g. (2014-)
222
+ rec.year = $1.to_i
223
+ elsif rec.name =~ /\(-([0-9]{4})\)/ ## e.g. (-2014)
224
+ rec.year_end = $1.to_i
225
+ elsif rec.name =~ /\(([0-9]{4})-([0-9]{4})\)/ ## e.g. (2011-2014)
226
+ rec.year = $1.to_i
227
+ rec.year_end = $2.to_i
228
+ else
229
+ ## todo/check: warn about unknown year format
230
+ end
231
+ end
232
+
233
+ ## todo/check - check for unknown format values
234
+ ## e.g. too many values, duplicate years, etc.
235
+ ## check for overwritting, etc.
236
+ while values.size > 0
237
+ value = values.shift
238
+ ## strip and squish (white)spaces
239
+ # e.g. León › Guanajuato => León › Guanajuato
240
+ value = value.strip.gsub( /[ \t]+/, ' ' )
241
+ if value =~/^\d{4}$/ # e.g 1904
242
+ ## todo/check: issue warning if year is already set!!!!!!!
243
+ if rec.year
244
+ puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{value}:"
245
+ pp rec
246
+ exit 1
247
+ end
248
+ rec.year = value.to_i
249
+ elsif value.start_with?( '@' ) # e.g. @ Anfield
250
+ ## cut-off leading @ and spaces
251
+ rec.ground = value[1..-1].strip
252
+ else
253
+ ## assume city / geo tree
254
+ ## split into geo tree
255
+ geos = split_geo( value )
256
+ city = geos[0]
257
+ ## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
258
+ if city =~ /\((.+?)\)/ ## note: use non-greedy (?) match
259
+ rec.district = $1.strip
260
+ city = city.gsub( /\(.+?\)/, '' ).strip
261
+ end
262
+ rec.city = city
263
+
264
+ if geos.size > 1
265
+ ## cut-off city and keep the rest (of geo tree)
266
+ rec.geos = geos[1..-1]
267
+ end
268
+ end
269
+ end ## while values
270
+
271
+
272
+ ###############
273
+ ## use headings text for geo tree
274
+
275
+ ## 1) add country if present
276
+ if headings.size > 0 && headings[0]
277
+ country = catalog.countries.find( headings[0] )
278
+ rec.country = country
279
+ else
280
+ ## make it an error - why? why not?
281
+ puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
282
+ exit 1
283
+ end
284
+
285
+ ## 2) check geo tree with headings hierarchy
286
+ if headings.size > 1 && headings[1]
287
+ geos = split_geo( headings[1] )
288
+ if rec.geos
289
+ if rec.geos[0] != geos[0]
290
+ puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
291
+ exit 1
292
+ end
293
+ if rec.geos[1] && rec.geos[1] != geos[1] ## check optional 2nd level too
294
+ puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
295
+ exit 1
296
+ end
297
+ else
298
+ ## add missing region (state/province) from headings hierarchy
299
+ rec.geos = geos
300
+ end
301
+ end
302
+
303
+ last_rec = rec
304
+
305
+
306
+ ### todo/fix:
307
+ ## auto-add alt name with dots stripped - why? why not?
308
+ ## e.g. D.C. United => DC United
309
+ ## e.g. Liverpool F.C. => Liverpool FC
310
+ ## e.g. St. Albin => St Albin etc.
311
+ ## e.g. 1. FC Köln => 1 FC Köln -- make special case for 1. - why? why not?
312
+
313
+ ##
314
+ ## todo/fix: unify mapping entries
315
+ ## always lowercase !!!! (case insensitive)
316
+ ## always strip (2011-) !!!
317
+ ## always strip dots (e.g. St., F.C, etc.)
318
+
319
+ recs << rec
320
+ end
321
+ end # each line (in paragraph)
322
+ else
323
+ puts "** !!! ERROR !!! [club reader] - unknown line type:"
324
+ pp node
325
+ exit 1
326
+ end
327
+ end
328
+
329
+ recs
330
+ end # method read
331
+
332
+ #######################
333
+ ### helpers
334
+ def split_geo( str )
335
+ ## assume city / geo tree
336
+ ## strip and squish (white)spaces
337
+ # e.g. León › Guanajuato => León › Guanajuato
338
+ str = str.strip.gsub( /[ \t]+/, ' ' )
339
+
340
+ ## split into geo tree
341
+ geos = str.split( /[<>‹›]/ ) ## note: allow > < or › ‹
342
+ geos = geos.map { |geo| geo.strip } ## remove all whitespaces
343
+ geos
344
+ end
345
+
346
+ end # class ClubReader
347
+
348
+
349
+ end ## module Import
350
+ end ## module SportDb