sportdb-formats 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,90 +0,0 @@
1
- #####################
2
- # helpers for parsing & finding match status e.g.
3
- # - cancelled / canceled
4
- # - awarded
5
- # - abandoned
6
- # - replay
7
- # etc.
8
-
9
-
10
module SportDb

  ### todo/fix: move Status inside Match struct - why? why not?

  ## Namespace holding the known match status codes (plain strings).
  class Status
    # note: use a class as an "enum"-like namespace for now - why? why not?
    #   move class into Match e.g. Match::Status - why? why not?
    CANCELLED = 'CANCELLED'   # canceled (US spelling), cancelled (UK spelling) - what to use?
    AWARDED   = 'AWARDED'
    POSTPONED = 'POSTPONED'
    ABANDONED = 'ABANDONED'
    REPLAY    = 'REPLAY'
  end # class Status


  ## Helpers for parsing a status text and for finding (and tagging)
  ##  a status inside the square-bracket text runs of a line.
  class StatusParser

    ## ordered lookup table: the first pattern matching the START of the text wins
    ##  note: anchored with \A (start of string) and NOT ^ (start of any line),
    ##          so a status word on a later line of a multi-line text is no match
    ##  note: english usage - cancelled (in UK), canceled (in US)
    STATUS_RES = [
      [/\A(?:cancelled|canceled|can\.)/i, Status::CANCELLED],
      [/\A(?:awarded|awd\.)/i,            Status::AWARDED],
      [/\A(?:postponed)/i,                Status::POSTPONED],
      [/\A(?:abandoned|abd\.)/i,          Status::ABANDONED],
      [/\A(?:replay)/i,                   Status::REPLAY],
    ].freeze

    ## Map a status text (e.g. "cancelled", "awd.", "Replay") to a Status constant.
    ##  Matching is case-insensitive and prefix-based
    ##   (text may continue after the status word).
    ##  note: returns nil if no match found (or if str is nil)
    def self.parse( str )
      STATUS_RES.each do |regex, status|
        return status   if str =~ regex
      end
      nil   # no match
    end


    ## a "protected" text run block e.g. [cancelled]
    RUN_RE = /\[
                (?<text>[^\]]+)
              \]
             /x

    ## Scan all [...] text runs of line (left to right) for a status.
    ##  On the first hit the run gets replaced IN PLACE (line is mutated)
    ##   with [STATUS.<status>] and the status is returned;
    ##  returns nil (and leaves line untouched) if no run holds a status.
    def self.find!( line )
      rest = line
      while (m = rest.match( RUN_RE ))
        rest = m.post_match   ## keep on processing rest of line (a.k.a. post match string)

        status = parse( m[:text].strip )
        next unless status

        ## m[0] is the complete run incl. brackets; its first occurrence in line
        ##   is this run (any earlier identical run would have matched before)
        line.sub!( m[0], "[STATUS.#{status}]" )
        return status
      end

      nil
    end # method find!
  end # class StatusParser

end # module SportDb
90
-
@@ -1,59 +0,0 @@
1
- # encoding: utf-8
2
-
3
-
4
module Datafile    # note: keep Datafile in its own top-level module/namespace for now - why? why not?

  ## Read the complete file at path and return its text.
  def self.read( path )   ## todo/check: use as a shortcut helper - why? why not?
    ## note: always assume utf-8 for now!!!
    File.open( path, 'r:utf-8' ) { |f| f.read }
  end


  ########################
  ## Collects text blocks in an in-memory buffer and
  ##   writes the buffer to path on close.
  ## todo/fix: turn into Datafile::Bundle.new and Bundle#write/save -why? why not?
  class Bundle
    def initialize( path )
      @path = path
      @buf  = String.new('')
    end

    ## Append to the bundle; every appended text is followed by a blank line.
    ##   - an Array is assumed to hold datafile paths; every file gets read
    ##       and everything from an __END__ marker onwards is cut off
    ##   - anything else is assumed to be a string (e.g. header, comments, etc.)
    ##  Returns self, so that appends can be chained
    ##    e.g. bundle << header << datafiles
    ##   (returning the passed-in array or the buffer would let a chained <<
    ##     append to the wrong object).
    def <<( value )
      if value.is_a?( Array )   ## assume array of datafiles (file paths)
        value.each do |datafile|
          text = Datafile.read( datafile )
          ## todo/fix/check: move sub __END__ to Datafile.read and turn it always on - why? why not?
          _append( text.sub( /__END__.*/m, '' ) )   ## note: add/allow support for __END__; use m-multiline flag
        end
      else                      ## assume string (e.g. header, comments, etc.)
        _append( value )
      end
      self
    end
    alias_method :write, :<<

    ## Write the collected buffer to the bundle path (utf-8).
    ## todo/fix/check: write only on close? or write on every write and use close for close?
    def close
      File.open( @path, 'w:utf-8' ) { |f| f.write( @buf ) }
    end

    private

    ## add text plus a separating blank line to the buffer
    def _append( text )
      @buf << text
      @buf << "\n\n"
    end
  end # class Bundle


  ## Shortcut helper: bundle up the optional header followed by all datafiles
  ##   (each cut off at __END__) and write the result to path.
  ##  Returns the result of Bundle#close.
  def self.write_bundle( path, datafiles:, header: nil )
    bundle = Bundle.new( path )
    bundle.write( header )   if header
    ## note: pass along one-element arrays - reuses the read & cut-off __END__ logic
    ##         of Bundle#<< and works for any datafiles collection with each
    datafiles.each { |datafile| bundle.write( [datafile] ) }
    bundle.close
  end

end # module Datafile
@@ -1,119 +0,0 @@
1
-
2
module SportDb
module Import

## Reads event info records (league, season, start/end dates,
##   number of teams, matches and goals) from a tabular (csv) text.
class EventInfoReader
  def catalog() Import.catalog; end


  ## Read the (utf-8) file at path and parse it; returns an array of EventInfo records.
  def self.read( path )
    txt = File.open( path, 'r:utf-8' ) { |f| f.read }
    new( txt ).parse
  end

  ## Parse the text; returns an array of EventInfo records.
  def self.parse( txt )
    new( txt ).parse
  end

  def initialize( txt )
    @txt = txt
  end

  ## Build one EventInfo record per row.
  ##  Columns used: League, Season (or Year), Dates,
  ##                Clubs (or Teams), Goals, Matches.
  ##  note: prints debug output and exits (with status 1) on invalid dates.
  def parse
    recs = []

    parse_csv( @txt ).each do |row|
      season = Season.parse( row['Season'] || row['Year'] )
      league = catalog.leagues.find!( row['League'] )

      dates  = _parse_dates( row['Dates'], league, season )

      ## note: remove (and allow) all non-digits e.g. 370 goals, 20 clubs, etc.
      teams   = _parse_count( row['Clubs'] || row['Teams'] )
      goals   = _parse_count( row['Goals'] )
      matches = _parse_matches( row['Matches'] )

      recs << EventInfo.new( league:     league,
                             season:     season,
                             start_date: dates[0],
                             end_date:   dates[1],
                             teams:      teams,
                             matches:    matches,
                             goals:      goals )
    end # each row

    recs
  end # method parse


  private

  ## Parse the dates column into zero, one (start date only) or
  ##   two (start and end date) dates e.g. "Aug 15" or "Jul 26 - Jul 5".
  ##  The start date gets resolved relative to Jan 1 of the season start year,
  ##   the end date relative to Jan 1 of the season end year
  ##   (or the start year if the season has no end year).
  def _parse_dates( dates_col, league, season )
    ## no dates - keep dates array empty
    return []   if dates_col.nil? || dates_col.empty?

    dates_col = dates_col.gsub( /[ ]{2,}/, ' ' )   ## squish/fold spaces

    puts "#{league.name} (#{league.key}) | #{season.key} | #{dates_col}"

    ### todo/check: check what parts "Aug 15" return ???
    ###   short form for "Aug 15 -" - works?

    ## todo/fix!!! - check EventInfo.include?
    ##   now allow dates with only start_date too!! (WITHOUT end_date)
    parts = dates_col.split( /[ ]*[–-][ ]*/ )
    unless parts.size == 1 || parts.size == 2
      puts "!! ERROR - expected date range / period - one or two dates; got #{parts.size}:"
      pp dates_col
      pp parts
      exit 1
    end

    pp parts
    dates = parts.each_with_index.map do |part, i|
      year = i == 0 ? season.start_year : (season.end_year || season.start_year)
      DateFormats.parse( part, start: Date.new( year, 1, 1 ), lang: 'en' )
    end
    pp dates

    if dates.size == 2
      ## assert/check that period is no longer than 365 days for now
      diff = dates[1].to_date.jd - dates[0].to_date.jd
      puts "#{diff}d"
      if diff > 365
        puts "!! ERROR - date range / period assertion failed; expected diff <= 365 days"
        exit 1
      end
    end

    dates
  end

  ## Turn a count column into an integer; all non-digits get removed first
  ##   e.g. "370 goals" => 370; returns nil if the column is nil or holds no digits.
  def _parse_count( col )
    return nil   if col.nil?

    digits = col.gsub( /[^0-9]/, '' )
    digits.empty? ? nil : digits.to_i
  end

  ## Turn the matches (played) column into an integer; supports additions
  ##   e.g. "132 + 63 Play-off-Spiele" => 195;
  ##   returns nil if the column is nil or holds neither digits nor a plus.
  def _parse_matches( col )
    return nil   if col.nil?

    expr = col.gsub( /[^0-9+]/, '' )
    return nil   if expr.empty?

    ## note: for now only supports additions;
    ##         a single (integer) number is a sum with one summand
    expr.split( '+' ).reduce( 0 ) { |sum, str| sum + str.to_i }
  end
end # class EventInfoReader


end ## module Import
end ## module SportDb
119
-
@@ -1,289 +0,0 @@
1
- ###
2
- # todo - based on ClubReader
3
- # share GeoReader or BaseReader or such for both
4
- # plus maybe for PlayerReader too!!!
5
- #
6
- # fix/todo/cleanup - move alt_names_auto from reader to indexer!!!!
7
- # indexer now handles unaccent (variants) etc.
8
-
9
module SportDb
module Import


## Reads ground records from an outline text:
##   headings build up the geo hierarchy (level 1 is the country) and
##   paragraph lines hold the ground records (one per line),
##   optionally followed by alternative names lines (starting with |)
##   and an address line.
class GroundReader

  def world() Import.world; end


  ## Read the (utf-8) file at path and parse it; returns an array of ground records.
  def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
    txt = File.open( path, 'r:utf-8' ) { |f| f.read }
    parse( txt )
  end

  ## Parse the text; returns an array of ground records.
  def self.parse( txt )
    new( txt ).parse
  end

  def initialize( txt )
    @txt = txt
  end


  ## pattern for checking for address line e.g.
  ##   use just one style / syntax - why? why not?
  ##  Fischhofgasse 12 ~ 1100 Wien or
  ##  Fischhofgasse 12 // 1100 Wien or Fischhofgasse 12 /// 1100 Wien
  ##  Fischhofgasse 12 ++ 1100 Wien or Fischhofgasse 12 +++ 1100 Wien
  ADDR_MARKER_RE = %r{ (?: ^|[ ] )               # space or beginning of line
                       (?: ~ | /{2,} | \+{2,} )
                       (?: [ ]|$)                # space or end of line
                     }x


  ## Walk all outline nodes and collect the ground records.
  ##  note: prints debug output and exits (with status 1) on errors.
  def parse
    recs     = []
    last_rec = nil
    headings = []   ## headings stack

    OutlineReader.parse( @txt ).each do |node|
      if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
        ## note: heading level is the digit in the node type e.g. :h2 => 2
        _update_headings( headings, node[0][1].to_i, node[1] )
      elsif node[0] == :p   ## paragraph with (text) lines
        node[1].each do |line|
          if line.start_with?( '|' )
            ## assume continuation with line of alternative names
            _assert_last_rec( last_rec, line )
            ##  note: skip leading pipe
            values = line[1..-1].split( '|' )               # names - allow/use pipe(|)
            values = values.map { |value| _norm( value ) }   ## squish/strip etc.

            last_rec.alt_names += values
          elsif line =~ ADDR_MARKER_RE
            ## address line - see ADDR_MARKER_RE for the styles supported
            _assert_last_rec( last_rec, line )
            ## squish line here - why? why not?
            last_rec.address = _squish( line )
          else
            last_rec = _parse_ground( line, headings )
            recs << last_rec
          end
        end # each line (in paragraph)
      else
        puts "** !!! ERROR !!! [ground reader] - unknown line type:"
        pp node
        exit 1
      end
    end

    recs
  end # method parse

  #######################
  ### helpers

  ## Split a geo tree string into its (stripped) parts
  ##   e.g. León › Guanajuato => ["León", "Guanajuato"]
  def split_geo( str )
    ## strip and squish (white)spaces first
    str = _squish( str )

    ## split into geo tree
    geos = str.split( /[<>‹›]/ )   ## note: allow > < or › ‹
    geos.map { |geo| geo.strip }   ## remove leading and trailing whitespaces
  end

  ## norm(alize) helper - squish (spaces)
  ##    and remove dollars ($$$)
  ##    and remove leading and trailing spaces
  def _norm( str )
    ## only extra clean-up of dollars for now ($$$)
    _squish( str.gsub( '$', '' ) )
  end

  ## fold runs of spaces, tabs and non-breaking spaces into a single space
  ##   and remove leading and trailing whitespace
  def _squish( str )
    str.gsub( /[ \t\u00a0]+/, ' ' ).strip
  end


  private

  ## Exit with an error if an alt names or address line shows up
  ##   before any ground record (there is no record to attach it to).
  def _assert_last_rec( last_rec, line )
    return   if last_rec

    puts "!!! error [ground reader] - alt names / address line without a preceding ground record:"
    pp line
    exit 1
  end

  ## Update the headings stack (in place) for a heading of the given level.
  ##  Level 1 is assumed to be the country (the country key gets pushed),
  ##  all other levels push the heading text itself.
  def _update_headings( headings, heading_level, heading )
    puts "heading #{heading_level} >#{heading}<"

    ## 1) first pop headings if present
    while headings.size+1 > heading_level
      headings.pop
    end

    ## 2) add missing (hierarchy) level if
    while headings.size+1 < heading_level
      ## todo/fix: issue warning about "skipping" hierarchy level
      puts "!!! warn [ground reader] - skipping hierarchy level in headings "
      headings.push( nil )
    end

    if heading =~ /^\?+$/   ## note: use ? or ?? or ??? to reset level to nil
      ## keep level empty
    else
      ## note: if level is 1 assume country for now
      if heading_level == 1
        ## assume country in heading; allow all "formats" supported by parse e.g.
        ##   Österreich • Austria (at)
        ##   Österreich • Austria
        ##   Austria
        ##   Deutschland (de) • Germany
        country = world.countries.parse( heading )
        ## check country code - MUST exist for now!!!!
        if country.nil?
          puts "!!! error [ground reader] - unknown country >#{heading}< - sorry - add country to config to fix"
          exit 1
        end

        headings.push( country.key )
      else
        ## quick hack:
        ##   remove known fill/dummy words incl:
        ##     Provincia San Juan  =>  San Juan   (see argentina, for example)
        ##
        ##   use geo tree long term with alternative names - why? why not?
        words = ['Provincia']
        words.each { |word| heading = heading.gsub( word, '' ) }
        heading = heading.strip

        headings.push( heading )
      end

      ## assert that hierarchy level is ok
      ##   note: report the size of the headings stack (NOT the length of the heading text)
      if headings.size != heading_level
        puts "!!! error - headings hierarchy/stack out of order - #{headings.size}<=>#{heading_level}"
        exit 1
      end
    end

    pp headings
  end

  ## Build a ground record from a comma-separated record line:
  ##   the first value holds the canonical name (plus optional inline alt names),
  ##   a four-digit value is the year, an all digits/underscores value
  ##   (capacity) gets skipped and everything else is a city / geo tree.
  ##  The country and a missing geo tree get filled in from the headings stack.
  def _parse_ground( line, headings )
    values = line.split( ',' )

    rec = Ground.new

    col = values.shift   ## get first item
    ## note: allow optional alt names for convenience with required canonical name
    names = col.split( '|' ).map { |name| _norm( name ) }   ## squish/strip etc.

    rec.name  = names[0]       ## canonical name (global unique "beautiful/long" name)
    alt_names = names[1..-1]   ## optional (inline) alt names
    ## note: add optional (inline) alternate names if present
    rec.alt_names += alt_names   if alt_names.size > 0

    ## check if canonical name includes (2011-) or similar
    ##  note: the years get set on the record, the name itself is kept as is
    ##  NOTE(review): the original computed the name with the (...) part removed
    ##                  but never used it - confirm if the canonical name
    ##                  should get stripped (and the long form added to alt names)
    ##  todo: move year out of canonical team name - why? why not?
    if rec.name =~ /\(([0-9]{4})-\)/                  ## e.g. (2014-)
      rec.year     = $1.to_i
    elsif rec.name =~ /\(-([0-9]{4})\)/               ## e.g. (-2014)
      rec.year_end = $1.to_i
    elsif rec.name =~ /\(([0-9]{4})-([0-9]{4})\)/     ## e.g. (2011-2014)
      rec.year     = $1.to_i
      rec.year_end = $2.to_i
    else
      ## todo/check: warn about unknown year format
    end

    ## todo/check - check for unknown format values
    ##   e.g. too many values, duplicate years, etc.
    ##   check for overwriting, etc.

    ## strip and squish (white)spaces
    #   e.g. León     › Guanajuato  => León › Guanajuato
    values.map { |value| _squish( value ) }.each do |value|
      if value =~ /^\d{4}$/        # e.g. 1904
        if rec.year
          puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{value}:"
          pp rec
          exit 1
        end
        rec.year = value.to_i
      elsif value =~ /^[0-9_]+$/   # digits and underscores only e.g. 48_000
        ## skip capacity for now
      else
        ## assume city / geo tree
        geos = split_geo( value )
        city = geos[0]
        ## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
        if city =~ /\((.+?)\)/   ## note: use non-greedy (?) match
          rec.district = $1.strip
          city = city.gsub( /\(.+?\)/, '' ).strip
        end
        rec.city = city

        ## cut-off city and keep the rest (of geo tree)
        rec.geos = geos[1..-1]   if geos.size > 1
      end
    end

    ###############
    ##   use headings text for geo tree

    ## 1) add country if present
    if headings[0]
      rec.country = world.countries.find( headings[0] )
    else
      ## make it an error - why? why not?
      puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
      exit 1
    end

    ## 2) check geo tree with headings hierarchy
    if headings[1]
      geos = split_geo( headings[1] )
      if rec.geos
        if rec.geos[0] != geos[0]
          puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
          exit 1
        end
        if rec.geos[1] && rec.geos[1] != geos[1]   ## check optional 2nd level too
          puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
          exit 1
        end
      else
        ## add missing region (state/province) from headings hierarchy
        rec.geos = geos
      end
    end

    rec
  end

end # class GroundReader


end ## module Import
end ## module SportDb