sportdb-formats 2.0.1 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,318 +0,0 @@
1
-
2
module SportDb
module Import


##
## Reads club records from an outline-style text document.
##
## The text is handed to OutlineReader.parse; the resulting nodes are
## processed in order:
##   - heading nodes (:h1..:h6) maintain a stack of headings; the level 1
##     heading is resolved to a country, all deeper levels are kept as text
##   - paragraph nodes (:p) hold the lines; every line is one of
##       | alt name | alt name    - more alternative names for the last club
##       (ii) Name                - b (child) team / club of the last club
##       Street 12 ~ 1100 City    - address of the last club
##       Name | Alt, 1904, @ Ground, City (District) › Region
##                                - a new club
##
## Note: all input errors are reported with puts and end the process with
##  exit 1 (no exception classes are used).
class ClubReader

  def world() Import.world; end


  ## Reads the file at path (as utf-8) and parses it.
  ## Returns the array of club records built by #parse.
  def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
    txt = File.open( path, 'r:utf-8' ) { |f| f.read }
    parse( txt )
  end

  ## Parses the passed-in text and returns the array of club records.
  def self.parse( txt )
    new( txt ).parse
  end

  def initialize( txt )
    @txt = txt
  end


  ## pattern for b (child) team / club marker e.g.
  ##    (ii) or ii) or ii.) or (ii.) or (II)
  ##    (b)  or b)  or b.)  or (b.)  or (B)
  ##    (2)  or 2)  or 2.)  or (2.)
  B_TEAM_MARKER_RE = %r{^ \(?          # optional opening bracket
                          (?: ii|b|2 )
                          \.?          # optional dot - keep and allow dot - why? why not?
                          \)           # required closing bracket
                        }xi   ## note: case-insensitive (e.g. II/ii or B/b)

  ## pattern for checking for address line e.g.
  ##    Fischhofgasse 12 ~ 1100 Wien     or
  ##    Fischhofgasse 12 // 1100 Wien    or   Fischhofgasse 12 /// 1100 Wien
  ##    Fischhofgasse 12 ++ 1100 Wien    or   Fischhofgasse 12 +++ 1100 Wien
  ADDR_MARKER_RE = %r{ (?: ^|[ ] )                 # space or beginning of line
                       (?: ~ | /{2,} | \+{2,} )
                       (?: [ ]|$)                  # space or end of line
                     }x


  ## Walks all outline nodes and returns the array of club records
  ##  (in document order; b teams are separate records linked via a / b).
  def parse
    recs     = []
    last_rec = nil
    headings = []   ## headings stack

    OutlineReader.parse( @txt ).each do |node|
      if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
        ## note: the level is the digit in the node type e.g. :h2 => 2
        _update_headings( headings, node[0][1].to_i, node[1] )
      elsif node[0] == :p    ## paragraph with (text) lines
        node[1].each do |line|
          if line.start_with?( '|' )
            ## continuation line with alternative names; skip leading pipe
            _assert_last_rec( last_rec, line )
            values = line[1..-1].split( '|' ).map { |value| _norm( value ) }
            last_rec.alt_names += values
          elsif line =~ B_TEAM_MARKER_RE
            ## b (child) team / club - the last record is assumed to be the a team
            _assert_last_rec( last_rec, line )
            rec = Club.new
            ## note: allow just the canonical name for now;
            ##   remove the (leading) b team marker first
            rec.name   = _norm( line.sub( B_TEAM_MARKER_RE, '' ).strip )
            rec.a      = last_rec
            last_rec.b = rec

            last_rec = rec
            recs << rec
          elsif line =~ ADDR_MARKER_RE
            _assert_last_rec( last_rec, line )
            last_rec.address = _squish( line )
          else
            rec = _parse_club( line, headings )
            last_rec = rec
            recs << rec
          end
        end  # each line (in paragraph)
      else
        puts "** !!! ERROR !!! [club reader] - unknown line type:"
        pp node
        exit 1
      end
    end

    recs
  end # method parse


  #######################
  ### helpers

  ## Splits a geo tree (e.g. León › Guanajuato) into its squished parts.
  def split_geo( str )
    ## note: allow > < or › ‹ as separators
    _squish( str ).split( /[<>‹›]/ ).map { |geo| geo.strip }
  end

  ## norm(alize) helper - remove dollars ($$$), squish (spaces)
  ##   and remove leading and trailing spaces
  def _norm( str )
    _squish( str.gsub( '$', '' ) )
  end

  ## collapse runs of spaces, tabs and non-breaking spaces into one space
  ##   and remove leading and trailing spaces
  def _squish( str )
    str.gsub( /[ \t\u00a0]+/, ' ' ).strip
  end


  private

  ## Stops with an error if no club record has been read yet; used by all
  ##   line types that attach to the last club (alt names, b team, address).
  def _assert_last_rec( last_rec, line )
    return if last_rec

    puts "!!! error [club reader] - club (record) required before line >#{line}<"
    exit 1
  end

  ## Updates the headings stack (in place) for a heading of the given level.
  def _update_headings( headings, heading_level, heading )
    puts "heading #{heading_level} >#{heading}<"

    ## 1) first pop headings if present
    headings.pop   while headings.size+1 > heading_level

    ## 2) add missing (hierarchy) levels
    while headings.size+1 < heading_level
      puts "!!! warn [team reader] - skipping hierarchy level in headings "
      headings.push( nil )
    end

    ## note: a heading of ? or ?? or ??? pushes nothing (level stays unset)
    unless heading =~ /^\?+$/
      if heading_level == 1
        ## assume country in heading; allow all "formats" supported by parse e.g.
        ##   Österreich • Austria (at)
        ##   Österreich • Austria
        ##   Austria
        ##   Deutschland (de) • Germany
        country = world.countries.parse( heading )
        ## check country code - MUST exist for now!!!!
        if country.nil?
          puts "!!! error [club reader] - unknown country >#{heading}< - sorry - add country to config to fix"
          exit 1
        end

        headings.push( country.key )
      else
        ## quick hack: remove known fill/dummy words incl:
        ##   Provincia San Juan  =>  San Juan   (see argentina, for example)
        words = ['Provincia']
        words.each { |word| heading = heading.gsub( word, '' ) }

        headings.push( heading.strip )
      end

      ## assert that hierarchy level is ok
      ##   note: reports the size of the headings stack
      if headings.size != heading_level
        puts "!!! error - headings hierarchy/stack out of order - #{headings.size}<=>#{heading_level}"
        exit 1
      end
    end

    pp headings
  end

  ## Builds one club record from a "regular" club line e.g.
  ##   Name | Alt Name, 1904, @ Ground, City (District) › Region
  ## and adds the country and geo tree from the headings stack.
  def _parse_club( line, headings )
    values = line.split( ',' )
    rec    = Club.new

    ## first column holds the required canonical name
    ##   plus optional (inline) alt names separated by pipe (|)
    names = values.shift.to_s.split( '|' ).map { |name| _norm( name ) }
    if names.empty? || names[0].empty?
      puts "!!! error [club reader] - club name missing in line >#{line}<"
      exit 1
    end

    rec.name = names[0]    # canonical name (global unique "beautiful/long" name)
    alt_names = names[1..-1]
    rec.alt_names += alt_names   if alt_names.size > 0

    ## check if canonical name includes a year marker
    ##   note: the marker stays in the name for now
    if rec.name =~ /\(([0-9]{4})-\)/                   ## e.g. (2014-)
      rec.year     = $1.to_i
    elsif rec.name =~ /\(-([0-9]{4})\)/                ## e.g. (-2014)
      rec.year_end = $1.to_i
    elsif rec.name =~ /\(([0-9]{4})-([0-9]{4})\)/      ## e.g. (2011-2014)
      rec.year     = $1.to_i
      rec.year_end = $2.to_i
    end

    values.map { |value| _squish( value ) }.each do |value|
      next if value.empty?   ## skip empty values (e.g. London, , 1900)

      if value =~ /^\d{4}$/     # e.g 1904
        if rec.year
          puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{value}:"
          pp rec
          exit 1
        end
        rec.year = value.to_i
      elsif value.start_with?( '@' )    # e.g. @ Anfield
        ## cut-off leading @ and spaces
        rec.ground = value[1..-1].strip
      else
        ## assume city / geo tree
        geos = split_geo( value )
        city = geos[0]
        ## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
        if city =~ /\((.+?)\)/    ## note: use non-greedy (?) match
          rec.district = $1.strip
          city = city.gsub( /\(.+?\)/, '' ).strip
        end
        rec.city = city

        ## cut-off city and keep the rest (of geo tree)
        rec.geos = geos[1..-1]   if geos.size > 1
      end
    end

    ## 1) add country from level 1 heading - required
    if headings[0]
      rec.country = world.countries.find( headings[0] )
    else
      puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
      exit 1
    end

    ## 2) check geo tree with headings hierarchy (level 2 heading)
    if headings[1]
      geos = split_geo( headings[1] )
      if rec.geos
        if rec.geos[0] != geos[0]
          puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
          exit 1
        end
        if rec.geos[1] && rec.geos[1] != geos[1]   ## check optional 2nd level too
          puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
          exit 1
        end
      else
        ## add missing region (state/province) from headings hierarchy
        rec.geos = geos
      end
    end

    rec
  end

end # class ClubReader


end ## module Import
end ## module SportDb
@@ -1,203 +0,0 @@
1
- # encoding: utf-8
2
-
3
-
4
module SportDb
module Import


##
## Reads club history records (renames, moves, bankruptcies, reforms
##  and merges) from an outline-style text document.
##
## Expected structure (as processed by #parse):
##   - heading 1  =>  country
##   - heading 2  =>  season
##   - paragraph lines; a line may start with a keyword that stays in
##     effect for the following lines until the next keyword or heading
##
## Returned records are plain arrays:
##   [ 'BANKRUPT', season_key, [team, country_key] ]
##   [ 'RENAME' | 'REFORM' | 'MOVE', season_key, [old_team, country_key], [new_team, country_key] ]
##   [ 'MERGE', season_key, [[team, country_key], ...], [new_team, country_key] ]
##
## Note: all input errors are reported with puts and end the process with exit 1.
class ClubHistoryReader

  def world() Import.world; end


  ## Reads the file at path (as utf-8) and parses it.
  def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
    txt = File.open( path, 'r:utf-8' ) { |f| f.read }
    parse( txt )
  end

  ## Parses the passed-in text and returns the array of history records.
  def self.parse( txt )
    new( txt ).parse
  end

  def initialize( txt )
    @txt = txt
  end


  ###
  ##  RENAME/RENAMED
  ##  MOVE/MOVED
  ##  BANKRUPT/BANKRUPTED
  ##  REFORM/REFORMED
  ##  MERGE/MERGED  - allow + or ++ or +++ or ; for "inline" - why? why not?
  KEYWORD_LINE_RE = %r{ ^(?<keyword>RENAMED?|
                                    MOVED?|
                                    BANKRUPT(?:ED)?|
                                    REFORM(?:ED)?|
                                    MERGED?
                         )
                         [ ]+
                         (?<text>.*)    # rest of text
                         $
                       }x

  ## maps every accepted keyword spelling to its "normalized" form
  KEYWORDS = {
    'BANKRUPT' => 'BANKRUPT',  'BANKRUPTED' => 'BANKRUPT',
    'RENAME'   => 'RENAME',    'RENAMED'    => 'RENAME',
    'REFORM'   => 'REFORM',    'REFORMED'   => 'REFORM',
    'MOVE'     => 'MOVE',      'MOVED'      => 'MOVE',
    'MERGE'    => 'MERGE',     'MERGED'     => 'MERGE',
  }.freeze


  ## Walks all outline nodes and returns the array of history records.
  def parse
    recs = []

    last_country = nil
    last_season  = nil
    last_keyword = nil
    last_teams   = []    ## teams collected for a MERGE (until the ⇒ line)

    OutlineReader.parse( @txt ).each do |node|
      if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
        heading_level = node[0][1].to_i
        heading       = node[1]

        puts "heading #{heading_level} >#{heading}<"

        if heading_level == 1
          ## assume country in heading; allow all "formats" supported by parse e.g.
          ##   Österreich • Austria (at)
          ##   Österreich • Austria
          ##   Austria
          ##   Deutschland (de) • Germany
          country = world.countries.parse( heading )
          ## check country code - MUST exist for now!!!!
          if country.nil?
            puts "!!! error [club history reader] - unknown country >#{heading}< - sorry - add country to config to fix"
            exit 1
          end
          puts "  country >#{heading}< => #{country.name}, #{country.key}"
          last_country = country
          last_season  = nil   ## reset "lower levels" - season & keyword
          last_keyword = nil
        elsif heading_level == 2
          ## assume season
          season = Season.parse( heading )
          puts "  season >#{heading}< => #{season.key}"
          last_season  = season
          last_keyword = nil   ## reset "lower levels" - keyword
        else
          puts "!!! ERROR [club history reader] - for now only heading 1 & 2 supported; sorry"
          exit 1
        end

      elsif node[0] == :p    ## paragraph with (text) lines
        if last_country.nil?
          puts "!!! ERROR [club history reader] - country heading 1 required, sorry"
          exit 1
        end
        if last_season.nil?
          puts "!!! ERROR [club history reader] - season heading 2 required, sorry"
          exit 1
        end

        node[1].each do |line|
          if m=line.match(KEYWORD_LINE_RE)    ## extract keyword and continue
            keyword = m[:keyword]
            line    = m[:text].strip

            puts "   keyword #{keyword}"
            last_keyword = KEYWORDS.fetch( keyword ) do    ## "normalize" keywords
              puts "!!! ERROR [club history reader] - unexpected keyword >#{keyword}<; sorry - don't know how to normalize"
              exit 1
            end

            last_teams = []
          end

          if last_keyword.nil?
            puts "!!! ERROR [club history reader] - line with keyword expected - got:"
            puts line
            exit 1
          end

          case last_keyword
          when 'BANKRUPT'
            ## requires / expects one team in one line
            recs << [ last_keyword, last_season.key,
                       [ squish(line), last_country.key ]
                    ]
          when 'RENAME', 'REFORM', 'MOVE'
            ## requires / expects two teams in one line (separated by ⇒)
            teams = line.split( '⇒' )
            if teams.size != 2
              puts "!!! ERROR [club history reader] - expected two teams - got:"
              pp teams
              exit 1
            end
            teams = teams.map { |team| squish(team) }
            recs << [ last_keyword, last_season.key,
                       [ teams[0], last_country.key ],
                       [ teams[1], last_country.key ]
                    ]
          when 'MERGE'
            ## a line starting with the separator closes the merge;
            ##   all other lines are collected as to-be-merged teams
            if line.start_with?( '⇒' )
              if last_teams.size < 2
                puts "!!! ERROR [club history reader] - expected two or more teams for MERGE - got:"
                pp last_teams
                exit 1
              end
              ## auto-add country to all teams
              teams = last_teams.map { |team| [team, last_country.key] }
              recs << [ last_keyword, last_season.key,
                         teams,
                         [ squish(line.sub('⇒','')), last_country.key ]
                      ]

              last_teams = []
            else
              last_teams << squish(line)
            end
          else
            puts "!!! ERROR [club history reader] - unknown keyword >#{last_keyword}<; cannot process; sorry"
            exit 1
          end
        end  # each line (in paragraph)
      else
        puts "** !!! ERROR [club history reader] - unknown line type:"
        pp node
        exit 1
      end
    end

    recs
  end # method parse


  ###############
  ## helper

  ## collapse runs of spaces, tabs and non-breaking spaces into one space
  ##   and remove leading and trailing spaces
  ##   note: String#strip on its own does NOT remove non-breaking spaces
  def squish( str )
    str.gsub( /[ \t\u00a0]+/, ' ' ).strip
  end


end # class ClubHistoryReader


end ## module Import
end ## module SportDb
@@ -1,90 +0,0 @@
1
- # encoding: utf-8
2
-
3
-
4
module SportDb
module Import


##
## Reads club properties (key, code) from a tabular (csv) text and
##  updates the matching club records found via the catalog in place.
##
## Every row requires a Name column; the name must match exactly one club
##  (if more than one club matches, the matches are narrowed down to the
##  clubs with the very same canonical name).
##
## Note: all input errors are reported with puts and end the process with exit 1.
class ClubPropsReader

  ## values (compared stripped and downcased) that count as
  ##   nil or n/a (not available/applicable)
  ##   add more or less - why? why not?
  NA_VARIANTS = ['-', '--', '---',
                 '?', '??', '???',
                 '_', '__', '___',
                 'na', 'n/a',
                 'nil', 'null'].freeze


  def catalog() Import.catalog; end


  ## Reads the file at path (as utf-8) and parses it.
  def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
    txt = File.open( path, 'r:utf-8' ) { |f| f.read }
    parse( txt )
  end

  ## Parses the passed-in text; see #parse.
  def self.parse( txt )
    new( txt ).parse
  end


  def initialize( txt )
    @txt = txt
  end

  ## Updates key and code of the matching club for every row.
  ## Returns the rows as returned by parse_csv.
  def parse
    recs = parse_csv( @txt )
    recs.each do |rec|
      name = rec['Name']
      if name.nil?
        puts "** !!! ERROR !!! Name column required / missing / NOT found in row:"
        pp rec
        exit 1
      end

      ## find / match club by (canonical) name
      ##   note: a nil result is handled like no match (empty array)
      m = catalog.clubs.match( name ) || []
      if m.size > 1
        puts "** !!! WARN !!! ambigious (multiple) club matches (#{m.size}) for name >#{name}< in props row:"
        pp rec
        pp m

        ## try to filter by canonical name if more than one match
        m = m.select { |club| club.name == name }
      end

      if m.empty?
        puts "** !!! ERROR !!! no club match for (canonical) name >#{name}< in props row:"
        pp rec
        exit 1
      elsif m.size > 1
        puts "** !!! ERROR !!! ambigious (multiple) club matches (#{m.size}) for (canonical) name >#{name}< in props row:"
        pp rec
        pp m
        exit 1
      else   ## assume size == 1, bingo!!!
        club_rec = m[0]
        ## todo/fix: warn if name differs from (canonical) name
        ## todo/fix: also add props to in-memory structs/records!!!
        ## todo/fix: only updated "on-demand" from in-memory struct/records!!!!

        ## update attributes
        club_rec.key   = rec['Key']    if is_not_na?( rec['Key'] )
        club_rec.code  = rec['Code']   if is_not_na?( rec['Code'] )
        ## todo/fix: add (some) more props e.g. address, web, etc.
      end
    end
  end  # method parse


  ## allow various values for nil or n/a (not available/applicable) for now
  def is_not_na?( col ) !is_na?( col); end  ## check: find a better name - why? why not?

  ## Returns true if col is nil, blank (empty or whitespace only) or one
  ##  of the NA_VARIANTS (ignoring case and surrounding whitespace).
  def is_na?( col )
    return true if col.nil?

    value = col.strip
    value.empty? || NA_VARIANTS.include?( value.downcase )
  end


end # class ClubPropsReader

end ## module Import
end ## module SportDb