sportdb-formats 2.0.2 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,119 +0,0 @@
1
-
2
module SportDb
  module Import

    ##
    # Reads event info records from a (csv) text.
    #
    # Columns (headers) read from every row:
    #   League, Season (or Year), Dates, Clubs (or Teams), Matches, Goals
    #
    # note: the csv text is split into rows by parse_csv (defined elsewhere);
    #       all that is required here is that every row answers to row['Header'].
    #
    # note: on a format error a message gets printed and the program
    #       ends with exit 1 (no exception is raised).
    class EventInfoReader
      def catalog() Import.catalog; end   ## shortcut convenience helper

      ## read the text (as utf-8) from path and parse it into an array of EventInfo records
      def self.read( path )
        txt = File.open( path, 'r:utf-8' ) { |f| f.read }
        parse( txt )
      end

      ## parse the text into an array of EventInfo records
      def self.parse( txt )
        new( txt ).parse
      end

      def initialize( txt )
        @txt = txt
      end

      ## returns an array with one EventInfo record per row;
      ##   start_date and end_date are nil if the Dates column is missing / empty,
      ##   end_date is nil if only a start date is present
      def parse
        recs = []

        parse_csv( @txt ).each do |row|
          season = Season.parse( row['Season'] || row['Year'] )
          league = catalog.leagues.find!( row['League'] )

          dates   = parse_dates( row['Dates'], league, season )

          teams   = parse_count( row['Clubs'] || row['Teams'] )
          goals   = parse_count( row['Goals'] )
          matches = parse_matches( row['Matches'] )

          recs << EventInfo.new( league:     league,
                                 season:     season,
                                 start_date: dates[0],
                                 end_date:   dates[1],
                                 teams:      teams,
                                 matches:    matches,
                                 goals:      goals
                               )
        end # each row

        recs
      end # method parse


      private

      ## parse the dates column e.g. "Aug 15 - May 20" or "Aug 15";
      ##  returns an array with zero, one (start only) or two (start, end) dates
      def parse_dates( dates_col, league, season )
        ## no dates - keep dates array empty
        return [] if dates_col.nil? || dates_col.empty?

        dates_col = dates_col.gsub( /[ ]{2,}/, ' ' )   ## squish/fold spaces

        puts "#{league.name} (#{league.key}) | #{season.key} | #{dates_col}"

        ### todo/check: check what parts "Aug 15" return ???
        ###   short form for "Aug 15 -"  - works?
        ## note: split on hyphen (-) or en dash (–);
        ##   dates with only start_date (WITHOUT end_date) are allowed too
        parts = dates_col.split( /[ ]*[–-][ ]*/ )

        unless parts.size == 1 || parts.size == 2
          puts "!! ERROR - expected date range / period - one or two dates; got #{parts.size}:"
          pp dates_col
          pp parts
          exit 1
        end

        pp parts
        dates = [DateFormats.parse( parts[0], start: Date.new( season.start_year, 1, 1 ), lang: 'en' )]
        if parts.size == 2
          ## note: use the season's end year for the end date if present
          end_year = season.end_year || season.start_year
          dates << DateFormats.parse( parts[1], start: Date.new( end_year, 1, 1 ), lang: 'en' )
        end
        pp dates

        assert_period( dates )  if dates.size == 2
        dates
      end

      ## assert/check that the period (start to end date) is no longer than 365 days for now
      def assert_period( dates )
        diff = dates[1].to_date.jd - dates[0].to_date.jd
        puts "#{diff}d"
        if diff > 365
          puts "!! ERROR - date range / period assertion failed; expected diff <= 365 days"
          exit 1
        end
      end

      ## parse a count column;
      ##  note: remove (and allow) all non-digits e.g. 370 goals, 20 clubs, etc.
      ##  returns nil if the column is missing or holds no digits
      def parse_count( col )
        return nil if col.nil?

        digits = col.gsub( /[^0-9]/, '' )
        digits.empty? ? nil : digits.to_i
      end

      ## parse the matches (played) column;
      ##  note: supports additions e.g.
      ##     132 + 63 Play-off-Spiele   => 195
      ##  returns nil if the column is missing or holds no digits / pluses
      def parse_matches( col )
        return nil if col.nil?

        expr = col.gsub( /[^0-9+]/, '' )
        return nil if expr.empty?

        ## note: for now only supports additions; a single number is a sum of one
        expr.split( '+' ).map { |str| str.to_i }.reduce( 0, :+ )
      end
    end # class EventInfoReader

  end ## module Import
end ## module SportDb
119
-
@@ -1,289 +0,0 @@
1
- ###
2
- # todo - based on ClubReader
3
- # share GeoReader or BaseReader or such for both
4
- # plus maybe for PlayerReader too!!!
5
- #
6
- # fix/todo/cleanup - move alt_names_auto from reader to indexer!!!!
7
- # indexer now handles unaccent (variants) etc.
8
-
9
module SportDb
  module Import

    ##
    # Reads ground records from an outline text (headings plus paragraphs of lines).
    #
    #  - headings build up the geo hierarchy: heading 1 is the country,
    #    heading 2 (if present) the region e.g. state / province
    #  - a paragraph line is either
    #      a new ground record e.g.  Name | Alt Name, 1904, 20_000, City (District) › Region
    #      a continuation line with alternative names (starting with a pipe) e.g.  | Alt Name | Alt Name
    #      an address line (holding an address marker) e.g.  Fischhofgasse 12 ~ 1100 Wien
    #
    # note: on a format error a message gets printed and the program
    #       ends with exit 1 (no exception is raised).
    class GroundReader

      def world() Import.world; end   ## shortcut convenience helper


      def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
        txt = File.open( path, 'r:utf-8' ) { |f| f.read }
        parse( txt )
      end

      def self.parse( txt )
        new( txt ).parse
      end

      def initialize( txt )
        @txt = txt
      end


      ## node types (of outline reader) handled as headings
      HEADING_TYPES = [:h1,:h2,:h3,:h4,:h5,:h6].freeze

      ## quick hack:
      ##   known fill/dummy words removed from (level 2+) headings incl:
      ##     Provincia San Juan  =>  San Juan   (see argentina, for example)
      ##
      ##  use geo tree long term with alternative names - why? why not?
      FILL_WORDS = ['Provincia'].freeze

      ## pattern for checking for address line e.g.
      ##   use just one style / syntax - why? why not?
      ##  Fischhofgasse 12 ~ 1100 Wien     or
      ##  Fischhofgasse 12 // 1100 Wien    or  Fischhofgasse 12 /// 1100 Wien
      ##  Fischhofgasse 12 ++ 1100 Wien    or  Fischhofgasse 12 +++ 1100 Wien
      ADDR_MARKER_RE = %r{ (?: ^|[ ] )                # space or beginning of line
                           (?: ~ | /{2,} | \+{2,} )
                           (?: [ ]|$)                 # space or end of line
                         }x


      ## returns an array of ground records (in the order read)
      def parse
        recs     = []
        last_rec = nil
        headings = []   ## headings stack

        OutlineReader.parse( @txt ).each do |node|
          if HEADING_TYPES.include?( node[0] )
            ## note: heading level is the digit in the node type e.g. :h2 => 2
            _update_headings( headings, node[0][1].to_i, node[1] )
          elsif node[0] == :p   ## paragraph with (text) lines
            node[1].each do |line|
              if line.start_with?( '|' )
                ## assume continuation with line of alternative names
                _assert_last_rec( last_rec, line )
                ##  note: skip leading pipe
                values = line[1..-1].split( '|' )
                values = values.map { |value| _norm( value ) }   ## squish/strip etc.

                last_rec.alt_names += values
              elsif line =~ ADDR_MARKER_RE
                _assert_last_rec( last_rec, line )
                ## squish line here - why? why not?
                last_rec.address = _squish( line )
              else
                rec = _parse_ground( line, headings )

                last_rec = rec
                recs << rec
              end
            end # each line (in paragraph)
          else
            puts "** !!! ERROR !!! [ground reader] - unknown line type:"
            pp node
            exit 1
          end
        end

        recs
      end # method parse

      #######################
      ### helpers

      def split_geo( str )
        ## assume city / geo tree
        ##   strip and squish (white)spaces
        #   e.g. León     › Guanajuato  => León › Guanajuato
        str = _squish( str )

        ## split into geo tree
        geos = str.split( /[<>‹›]/ )         ## note: allow > < or › ‹
        geos.map { |geo| geo.strip }         ## remove all (leading and trailing) whitespaces
      end

      ## norm(alize) helper - squish (spaces)
      ##                       and remove dollars ($$$)
      ##                       and remove leading and trailing spaces
      def _norm( str )
        ## only extra clean-up of dollars for now ($$$)
        _squish( str.gsub( '$', '' ) )
      end

      ## fold runs of spaces, tabs and non-breaking spaces into a single space
      ##   and remove leading and trailing spaces
      def _squish( str )
        str.gsub( /[ \t\u00a0]+/, ' ' ).strip
      end


      private

      ## update the headings stack (in place) for a heading of the given level
      def _update_headings( headings, heading_level, heading )
        puts "heading #{heading_level} >#{heading}<"

        ## 1) first pop headings if present
        headings.pop   while headings.size+1 > heading_level

        ## 2) add missing (hierarchy) level if
        while headings.size+1 < heading_level
          ## todo/fix: issue warning about "skipping" hierarchy level
          puts "!!! warn [ground reader] - skipping hierarchy level in headings "
          headings.push( nil )
        end

        ## note: use ? or ?? or ??? to reset level to nil (that is, keep level empty)
        unless heading =~ /^\?+$/
          ## note: if level is 1 assume country for now
          if heading_level == 1
            ## assume country in heading; allow all "formats" supported by parse e.g.
            ##   Österreich • Austria (at)
            ##   Österreich • Austria
            ##   Austria
            ##   Deutschland (de) • Germany
            country = world.countries.parse( heading )
            ## check country code - MUST exist for now!!!!
            if country.nil?
              puts "!!! error [ground reader] - unknown country >#{heading}< - sorry - add country to config to fix"
              exit 1
            end

            headings.push( country.key )
          else
            FILL_WORDS.each { |word| heading = heading.gsub( word, '' ) }
            headings.push( heading.strip )
          end

          ## assert that hierarchy level is ok
          ##   note: report the size of the headings stack (not the length of the heading text)
          if headings.size != heading_level
            puts "!!! error - headings hierarchy/stack out of order - #{headings.size}<=>#{heading_level}"
            exit 1
          end
        end

        pp headings
      end

      ## continuation (alt names) and address lines get added to the last record;
      ##   report a missing record (instead of failing with a NoMethodError on nil)
      def _assert_last_rec( last_rec, line )
        return if last_rec

        puts "!!! error [ground reader] - alt names or address line without (preceding) ground record:"
        pp line
        exit 1
      end

      ## build a new ground record from a (comma-separated) line
      ##   and the headings stack (country and optional region)
      def _parse_ground( line, headings )
        values = line.split( ',' )

        rec = Ground.new

        col = values.shift    ## get first item
        ## note: allow optional alt names for convenience with required canonical name
        names = col.split( '|' )
        names = names.map { |name| _norm( name ) }   ## squish/strip etc.

        alt_names = names[1..-1]    ## optional (inline) alt names

        rec.name = names[0]         ## canonical name (global unique "beautiful/long" name)
        ## note: add optional (inline) alternate names if present
        rec.alt_names += alt_names   if alt_names.size > 0

        _parse_years_in_name( rec )

        ## todo/check - check for unknown format values
        ##    e.g. too many values, duplicate years, etc.
        ##         check for overwriting, etc.
        values.each { |value| _parse_value( rec, _squish( value ) ) }

        _apply_headings( rec, headings )
        rec
      end

      ## check if canonical name includes (2011-) or similar and set year / year_end
      ##
      ##  todo: move year out of canonical team name - why? why not?
      ##  NOTE(review): the name is kept as is; the name with the (...) part removed
      ##    was computed before but never used (add to alt names? - confirm)
      ##  todo/check: warn about unknown year format
      def _parse_years_in_name( rec )
        if rec.name =~ /\(([0-9]{4})-\)/                    ## e.g. (2014-)
          rec.year     = $1.to_i
        elsif rec.name =~ /\(-([0-9]{4})\)/                 ## e.g. (-2014)
          rec.year_end = $1.to_i
        elsif rec.name =~ /\(([0-9]{4})-([0-9]{4})\)/       ## e.g. (2011-2014)
          rec.year     = $1.to_i
          rec.year_end = $2.to_i
        end
      end

      ## handle a single (squished) value following the name(s)
      def _parse_value( rec, value )
        if value =~ /^\d{4}$/    # e.g 1904
          if rec.year
            puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{value}:"
            pp rec
            exit 1
          end
          rec.year = value.to_i
        elsif value =~ /^[0-9_]+$/    # e.g 20_000 or 20000
          ## skip capacity for now
        else
          ## assume city / geo tree
          geos = split_geo( value )
          city = geos[0]
          ## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
          if city =~ /\((.+?)\)/    ## note: use non-greedy (?) match
            rec.district = $1.strip
            city = city.gsub( /\(.+?\)/, '' ).strip
          end
          rec.city = city

          ## cut-off city and keep the rest (of geo tree)
          rec.geos = geos[1..-1]   if geos.size > 1
        end
      end

      ## use headings text for geo tree
      def _apply_headings( rec, headings )
        ## 1) add country if present
        if headings.size > 0 && headings[0]
          rec.country = world.countries.find( headings[0] )
        else
          ## make it an error - why? why not?
          puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
          exit 1
        end

        ## 2) check geo tree with headings hierarchy
        if headings.size > 1 && headings[1]
          geos = split_geo( headings[1] )
          if rec.geos
            if rec.geos[0] != geos[0]
              puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
              exit 1
            end
            if rec.geos[1] && rec.geos[1] != geos[1]   ## check optional 2nd level too
              puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
              exit 1
            end
          else
            ## add missing region (state/province) from headings hierarchy
            rec.geos = geos
          end
        end
      end

    end # class GroundReader

  end ## module Import
end ## module SportDb
@@ -1,176 +0,0 @@
1
- # encoding: utf-8
2
-
3
-
4
module SportDb

  ## shared "higher-level" outline reader
  ##  todo: add CountryOutlineReader - why? why not?
  ##
  ## Splits an outline text into sec(tion)s - one per heading 1 -
  ##   where every heading holds the league and season (and an optional stage) e.g.
  ##     Österr. Bundesliga 2018/19, Regular Season
  ##
  ## note: on a format error a message gets printed and the program
  ##       ends with exit 1 (no exception is raised).
  class LeagueOutlineReader   ## todo/check - rename to LeaguePageReader / LeaguePageOutlineReader - why? why not?

    ## split into league + season
    ##  e.g. Österr. Bundesliga 2015/16   ## or 2015-16
    ##       World Cup 2018
    LEAGUE_SEASON_HEADING_RE = %r{^
           (?<league>.+?)     ## non-greedy
              \s+
           (?<season>\d{4}
              (?:[\/-]\d{1,4})?     ## optional 2nd year in season
           )
              $}x

    # note: normalize names e.g. downcase and remove all non a-z chars (e.g. space, dash, etc.)
    KNOWN_STAGES = [
      'Regular Season',
      'Regular Stage',
      'Championship Round',
      'Championship Playoff',   # or Championship play-off
      'Relegation Round',
      'Relegation Playoff',
      'Play-offs',
      'Playoff Stage',
      'Grunddurchgang',
      'Finaldurchgang - Qualifikationsgruppe',
      'Finaldurchgang - Qualifikation',
      'Finaldurchgang - Meistergruppe',
      'Finaldurchgang - Meister',
      'EL Play-off',
      'Europa League Play-off',
      'Europa-League-Play-offs',
      'Europa League Finals',
      'Playoffs - Championship',
      'Playoffs - Europa League',
      'Playoffs - Europa League - Finals',
      'Playoffs - Relegation',
      'Playoffs - Challenger',
      'Finals',
      'Match 6th Place',  # e.g. Super League Greece 2012/13

      'Apertura',
      'Apertura - Liguilla',
      'Clausura',
      'Clausura - Liguilla',

    ].map { |name| name.downcase.gsub( /[^a-z]/, '' ) }.freeze


    def self.read( path, season: nil )   ## use - rename to read_file or from_file etc. - why? why not?
      txt = File.open( path, 'r:utf-8' ) { |f| f.read }
      parse( txt, season: season )
    end

    def self.parse( txt, season: nil )
      new( txt ).parse( season: season )
    end


    def initialize( txt )
      @txt = txt
    end

    ## returns an array of sec(tion)s; every section is a hash with
    ##   league:  - league record (as returned by catalog.leagues.find!)
    ##   season:  - season record (as returned by Season.parse)
    ##   stage:   - stage text or nil if not present
    ##   lines:   - all lines of all paragraphs following the heading
    ##
    ## pass in a season (or an array or range of seasons) to
    ##   keep only the sections of the matching season(s)
    def parse( season: nil )
      secs = []     # sec(tion)s
      OutlineReader.parse( @txt ).each do |node|
        if node[0] == :h1
          ## check for league (and stage) and season
          heading = node[1]
          values  = split_league( heading )
          ## note: values is empty if the heading holds separators only (e.g. ",");
          ##   handle like any other heading without league and season
          m = values[0] ? values[0].match( LEAGUE_SEASON_HEADING_RE ) : nil
          if m
            puts "league >#{m[:league]}<, season >#{m[:season]}<"

            secs << { league: m[:league],
                      season: m[:season],
                      stage:  values[1],     ## note: defaults to nil if not present
                      lines:  []
                    }
          else
            puts "** !!! ERROR - cannot match league and season in heading; season missing?"
            pp heading
            exit 1
          end
        elsif node[0] == :p   ## paragraph with (text) lines
          lines = node[1]
          ## note: skip lines if no heading seen
          if secs.empty?
            puts "** !!! WARN - skipping lines (no heading):"
            pp lines
          else
            ## todo/check: unroll paragraphs into lines or pass along paragraphs - why? why not?
            secs[-1][:lines] += lines
          end
        else
          puts "** !!! ERROR - unknown line type; for now only heading 1 for leagues supported; sorry:"
          pp node
          exit 1
        end
      end


      ## pass 2 - filter seasons if filter present
      if season
        filter = norm_seasons( season )
        secs = secs.select do |sec|
          matched = filter.include?( Season.parse( sec[:season] ).key )
          puts " skipping season >#{sec[:season]}< NOT matched by filter"   unless matched
          matched
        end
      end

      ## pass 3 - check & map; replace inline (string with data struct record)
      secs.each do |sec|
        sec[:season] = Season.parse( sec[:season] )
        sec[:league] = catalog.leagues.find!( sec[:league] )

        check_stage( sec[:stage] )   if sec[:stage]   ## note: only check for now (no remapping etc.)
      end

      secs
    end # method parse



    def catalog() Import.catalog; end    ## shortcut convenience helper


    ## returns an array of season keys for a single season or an array or range of seasons
    def norm_seasons( season_or_seasons )
      seasons = if season_or_seasons.is_a?( Array )   # is it an array already
                  season_or_seasons
                elsif season_or_seasons.is_a?( Range )  # e.g. Season(1999)..Season(2001) or such
                  season_or_seasons.to_a
                else  ## assume - single entry - wrap in array
                  [season_or_seasons]
                end

      seasons.map { |season| Season( season ).key }
    end


    def split_league( str )   ## todo/check: rename to parse_league(s) - why? why not?
      ## split into league / stage / ... e.g.
      ##  => Österr. Bundesliga 2018/19, Regular Season
      ##  => Österr. Bundesliga 2018/19, Championship Round
      ##  etc.
      values = str.split( /[,<>‹›]/ )    ## note: allow , > < or › ‹ for now
      values.map { |value| value.strip }   ## remove all (leading and trailing) whitespaces
    end


    ## check if the stage name is known (see KNOWN_STAGES); if not
    ##   print an error message and exit
    def check_stage( name )
      # note: normalize names e.g. downcase and remove all non a-z chars (e.g. space, dash, etc.)
      return if KNOWN_STAGES.include?( name.downcase.gsub( /[^a-z]/, '' ) )

      puts "** !!! ERROR - no (league) stage match found for >#{name}<, add to (builtin) stages table; sorry"
      exit 1
    end

  end # class LeagueOutlineReader

end # module SportDb