sportdb-formats 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,318 +0,0 @@
1
-
2
module SportDb
module Import


##
# Reads club records from a text in outline format.
#
# The text is handed to OutlineReader.parse; this class only relies on the
# nodes being pairs of the form
#   [:h1..:h6, heading_text]   or   [:p, array_of_lines]
#
# Headings build up a stack: level 1 is resolved to a country
# (via world.countries.parse), deeper levels are kept as plain text and
# the level 2 heading is used as the (default) geo tree of a club.
#
# Paragraph lines are one of:
#   | alt name | alt name      - more alternative names for the last club
#   (ii) Name  /  b) Name      - b (child) team of the last club
#   Street 1 ~ 1100 City       - address of the last club
#   Name | Alt, 1904, City (District) › Region, @ Ground
#                              - a new club
#
# Note: all errors are reported with puts and end the process with exit 1.
class ClubReader

  def world() Import.world; end


  ## read the file in path (as utf-8) and parse it into an array of clubs
  def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
    txt = File.open( path, 'r:utf-8' ) { |f| f.read }
    parse( txt )
  end

  ## parse the text in txt into an array of clubs
  def self.parse( txt )
    new( txt ).parse
  end

  def initialize( txt )
    @txt = txt
  end

  ## node types handled as headings
  HEADING_TYPES = %i[h1 h2 h3 h4 h5 h6].freeze

  ## known fill/dummy words removed from (non-country) headings e.g.
  ##   Provincia San Juan => San Juan (see argentina, for example)
  ## use geo tree long term with alternative names - why? why not?
  FILL_WORDS = ['Provincia'].freeze

  ## pattern for b (child) team / club marker e.g.
  ##    (ii) or ii) or ii.) or (ii.) or (II)
  ##    (b)  or b)  or b.)  or (b.)  or (B)
  ##    (2)  or 2)  or 2.)  or (2.)
  B_TEAM_MARKER_RE = %r{^ \(?    # optional opening bracket
                        (?: ii|b|2 )
                        \.?      # optional dot - keep and allow dot - why? why not?
                        \)       # required closing bracket
                       }xi   ## note: case-insensitive (e.g. II/ii or B/b)

  ## pattern for checking for address line e.g.
  ##    Fischhofgasse 12 ~ 1100 Wien     or
  ##    Fischhofgasse 12 // 1100 Wien    or  Fischhofgasse 12 /// 1100 Wien
  ##    Fischhofgasse 12 ++ 1100 Wien    or  Fischhofgasse 12 +++ 1100 Wien
  ADDR_MARKER_RE = %r{ (?: ^|[ ] )     # space or beginning of line
                       (?: ~ | /{2,} | \+{2,} )
                       (?: [ ]|$)      # space or end of line
                     }x


  ##
  # Parse the text passed to the constructor.
  #
  # Returns an array of Club records in the order they appear in the text;
  # b teams are separate records linked to their a team via #a / #b.
  def parse
    recs     = []
    last_rec = nil
    headings = []    ## headings stack

    OutlineReader.parse( @txt ).each do |node|
      type = node[0]

      if HEADING_TYPES.include?( type )
        ## e.g. :h2 => "2" => 2
        _push_heading( headings, type[1].to_i, node[1] )
      elsif type == :p    ## paragraph with (text) lines
        node[1].each do |line|
          if line.start_with?( '|' )
            ## continuation with line of alternative names
            ##   note: skip leading pipe
            _require_last_rec( last_rec, line )
            values = line[1..-1].split( '|' ).map { |value| _norm( value ) }
            last_rec.alt_names += values
          elsif line =~ B_TEAM_MARKER_RE
            _require_last_rec( last_rec, line )
            ## note: assume / allow just canonical name for now
            rec = Club.new
            rec.name = _norm( line.sub( B_TEAM_MARKER_RE, '' ) )

            ## link a and b team / clubs - the last record is the a team
            rec.a      = last_rec
            last_rec.b = rec

            last_rec = rec
            recs << rec
          elsif line =~ ADDR_MARKER_RE
            _require_last_rec( last_rec, line )
            last_rec.address = _squish( line )
          else
            rec = _parse_club( line, headings )
            last_rec = rec
            recs << rec
          end
        end  # each line (in paragraph)
      else
        puts "** !!! ERROR !!! [club reader] - unknown line type:"
        pp node
        exit 1
      end
    end

    recs
  end


  #######################
  ### helpers

  ##
  # Update the headings stack (in-place) for a heading of the given level.
  #
  # Pops deeper or equal levels first, fills skipped levels with nil and
  # then pushes the country key (level 1) or the cleaned-up heading text.
  # A heading made up of question marks only (?, ??, ...) leaves the
  # level empty.
  def _push_heading( headings, level, heading )
    puts "heading #{level} >#{heading}<"

    ## 1) first pop headings if present
    headings.pop   while headings.size+1 > level

    ## 2) add missing (hierarchy) level if
    while headings.size+1 < level
      puts "!!! warn [team reader] - skipping hierarchy level in headings "
      headings.push( nil )
    end

    unless heading =~ /^\?+$/    ## note: use ? or ?? or ??? to reset level to nil
      if level == 1
        ## assume country in heading; whatever world.countries.parse accepts
        country = world.countries.parse( heading )
        ## check country code - MUST exist for now!!!!
        if country.nil?
          puts "!!! error [club reader] - unknown country >#{heading}< - sorry - add country to config to fix"
          exit 1
        end
        headings.push( country.key )
      else
        FILL_WORDS.each { |word| heading = heading.gsub( word, '' ) }
        headings.push( heading.strip )
      end

      ## assert that hierarchy level is ok
      ##   note: report the size of the stack (headings) and
      ##          NOT the length of the heading text
      if headings.size != level
        puts "!!! error - headings hierarchy/stack out of order - #{headings.size}<=>#{level}"
        exit 1
      end
    end

    pp headings
  end

  ##
  # Make sure a continuation line (alt names, b team, address) has a
  # club to attach to; otherwise report the line and exit.
  def _require_last_rec( last_rec, line )
    return if last_rec

    puts "!!! error [club reader] - continuation line without preceding club line:"
    puts line
    exit 1
  end

  ##
  # Build a new club from a comma-separated line. The first column holds the
  # canonical name plus optional alt names separated by pipes (|); the
  # remaining columns are a founding year (four digits), a ground (@ ...)
  # or a city / geo tree - in any order.
  # The country and the (default) geo tree get added from the headings stack.
  def _parse_club( line, headings )
    values = line.split( ',' )

    ## note: allow optional alt names for convenience with required canonical name
    names = values.shift.to_s.split( '|' ).map { |name| _norm( name ) }
    if names.empty?
      puts "!!! error [club reader] - club name missing in line >#{line}<"
      exit 1
    end

    rec = Club.new
    rec.name       = names[0]     # canonical name (global unique "beautiful/long" name)
    rec.alt_names += names[1..-1]   if names.size > 1

    ## note: the year in the name e.g. (2011-) gets recorded but
    ##        stays part of the canonical name (it is NOT stripped)
    case rec.name
    when /\(([0-9]{4})-\)/               ## e.g. (2014-)
      rec.year     = $1.to_i
    when /\(-([0-9]{4})\)/               ## e.g. (-2014)
      rec.year_end = $1.to_i
    when /\(([0-9]{4})-([0-9]{4})\)/     ## e.g. (2011-2014)
      rec.year     = $1.to_i
      rec.year_end = $2.to_i
    end

    values.each do |value|
      value = _squish( value )
      ## skip blank columns - otherwise treated as an empty geo tree
      ##   that resets an already parsed city to nil
      next if value.empty?

      if value =~ /^\d{4}$/            # e.g 1904
        if rec.year
          puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{value}:"
          pp rec
          exit 1
        end
        rec.year = value.to_i
      elsif value.start_with?( '@' )   # e.g. @ Anfield
        ## cut-off leading @ and spaces
        rec.ground = value[1..-1].strip
      else
        ## assume city / geo tree
        geos = split_geo( value )
        city = geos[0]
        ## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
        if city =~ /\((.+?)\)/    ## note: use non-greedy (?) match
          rec.district = $1.strip
          city = city.gsub( /\(.+?\)/, '' ).strip
        end
        rec.city = city

        ## cut-off city and keep the rest (of geo tree)
        rec.geos = geos[1..-1]   if geos.size > 1
      end
    end

    ## 1) add country from headings - required
    if headings.size > 0 && headings[0]
      rec.country = world.countries.find( headings[0] )
    else
      puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
      exit 1
    end

    ## 2) check geo tree with headings hierarchy
    if headings.size > 1 && headings[1]
      geos = split_geo( headings[1] )
      if rec.geos
        if rec.geos[0] != geos[0]
          puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
          exit 1
        end
        if rec.geos[1] && rec.geos[1] != geos[1]   ## check optional 2nd level too
          puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
          exit 1
        end
      else
        ## add missing region (state/province) from headings hierarchy
        rec.geos = geos
      end
    end

    rec
  end

  ## split a geo tree e.g. León › Guanajuato into its (stripped) parts
  ##   note: allow > < or › ‹ as separator
  def split_geo( str )
    _squish( str ).split( /[<>‹›]/ ).map { |geo| geo.strip }
  end

  ## norm(alize) helper - remove dollars ($$$) and squish
  def _norm( str )
    _squish( str.gsub( '$', '' ) )
  end

  ## collapse runs of spaces, tabs and non-breaking spaces to one space
  ##   and remove leading and trailing whitespace
  def _squish( str )
    str.gsub( /[ \t\u00a0]+/, ' ' ).strip
  end

end # class ClubReader


end ## module Import
end ## module SportDb
@@ -1,203 +0,0 @@
1
- # encoding: utf-8
2
-
3
-
4
module SportDb
module Import


##
# Reads club history records (renames, moves, bankruptcies, reforms, merges)
# from a text in outline format.
#
# The text is handed to OutlineReader.parse; this class only relies on the
# nodes being pairs of the form
#   [:h1..:h6, heading_text]   or   [:p, array_of_lines]
#
# Heading 1 is resolved to a country (via world.countries.parse) and
# heading 2 to a season (via Season.parse); other levels are an error.
# Every paragraph requires both.
#
# Note: all errors are reported with puts and end the process with exit 1.
class ClubHistoryReader

  def world() Import.world; end


  ## read the file in path (as utf-8) and parse it into an array of records
  def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
    txt = File.open( path, 'r:utf-8' ) { |f| f.read }
    parse( txt )
  end

  ## parse the text in txt into an array of records
  def self.parse( txt )
    new( txt ).parse
  end

  def initialize( txt )
    @txt = txt
  end


  ## node types handled as headings
  HEADING_TYPES = %i[h1 h2 h3 h4 h5 h6].freeze

  ###
  ##   RENAME/RENAMED
  ##   MOVE/MOVED
  ##   BANKRUPT/BANKRUPTED
  ##   REFORM/REFORMED
  ##   MERGE/MERGED  - allow + or ++ or +++ or ; for "inline" - why? why not?
  KEYWORD_LINE_RE = %r{ ^(?<keyword>RENAMED?|
                                    MOVED?|
                                    BANKRUPT(?:ED)?|
                                    REFORM(?:ED)?|
                                    MERGED?
                         )
                         [ ]+
                         (?<text>.*)    # rest of text
                         $
                      }x

  ## "normalize" keywords - maps all accepted spellings to the record keyword
  KEYWORDS = {
    'BANKRUPT' => 'BANKRUPT',  'BANKRUPTED' => 'BANKRUPT',
    'RENAME'   => 'RENAME',    'RENAMED'    => 'RENAME',
    'REFORM'   => 'REFORM',    'REFORMED'   => 'REFORM',
    'MOVE'     => 'MOVE',      'MOVED'      => 'MOVE',
    'MERGE'    => 'MERGE',     'MERGED'     => 'MERGE',
  }.freeze


  ##
  # Parse the text passed to the constructor.
  #
  # Returns an array of records; every record starts with the keyword and
  # the season key followed by [team, country_key] pairs:
  #   BANKRUPT             - [keyword, season, [team, country]]
  #   RENAME/REFORM/MOVE   - [keyword, season, [from, country], [to, country]]
  #   MERGE                - [keyword, season, [[team, country], ...], [to, country]]
  #
  # A keyword stays active for the following lines until the next keyword
  # or heading.
  def parse
    recs = []

    last_country = nil
    last_season  = nil
    last_keyword = nil
    last_teams   = []    ## teams collected for the MERGE in progress

    OutlineReader.parse( @txt ).each do |node|
      if HEADING_TYPES.include?( node[0] )
        heading_level = node[0][1].to_i    ## e.g. :h2 => "2" => 2
        heading       = node[1]

        puts "heading #{heading_level} >#{heading}<"

        if heading_level == 1
          ## assume country in heading; whatever world.countries.parse accepts
          country = world.countries.parse( heading )
          ## check country code - MUST exist for now!!!!
          if country.nil?
            puts "!!! error [club history reader] - unknown country >#{heading}< - sorry - add country to config to fix"
            exit 1
          end
          puts " country >#{heading}< => #{country.name}, #{country.key}"
          last_country = country
          last_season  = nil    ## reset "lower levels" - season & keyword
          last_keyword = nil
        elsif heading_level == 2
          ## assume season
          season = Season.parse( heading )
          puts " season >#{heading}< => #{season.key}"
          last_season  = season
          last_keyword = nil    ## reset "lower levels" - keyword
        else
          puts "!!! ERROR [club history reader] - for now only heading 1 & 2 supported; sorry"
          exit 1
        end

      elsif node[0] == :p    ## paragraph with (text) lines
        if last_country.nil?
          puts "!!! ERROR [club history reader] - country heading 1 required, sorry"
          exit 1
        end
        if last_season.nil?
          puts "!!! ERROR [club history reader] - season heading 2 required, sorry"
          exit 1
        end

        node[1].each do |line|
          if (m = line.match( KEYWORD_LINE_RE ))   ## extract keyword and continue
            line = m[:text].strip

            puts " keyword #{m[:keyword]}"
            last_keyword = KEYWORDS.fetch( m[:keyword] ) do |keyword|
              puts "!!! ERROR [club history reader] - unexpected keyword >#{keyword}<; sorry - don't know how to normalize"
              exit 1
            end

            last_teams = []
          end

          if last_keyword.nil?
            puts "!!! ERROR [club history reader] - line with keyword expected - got:"
            puts line
            exit 1
          end

          case last_keyword
          when 'BANKRUPT'
            ## requires / expects one team in one line
            recs << [ last_keyword, last_season.key,
                      [ squish( line ), last_country.key ]
                    ]
          when 'RENAME', 'REFORM', 'MOVE'
            ## requires / expects two teams in one line (separated by ⇒)
            teams = line.split( '⇒' )
            if teams.size != 2
              puts "!!! ERROR [club history reader] - expected two teams - got:"
              pp teams
              exit 1
            end
            teams = teams.map { |team| squish( team ) }
            recs << [ last_keyword, last_season.key,
                      [ teams[0], last_country.key ],
                      [ teams[1], last_country.key ]
                    ]
          when 'MERGE'
            ## a line starting with the separator closes the merge;
            ##   otherwise collect the to-be-merged teams
            if line.start_with?( '⇒' )
              if last_teams.size < 2
                puts "!!! ERROR [club history reader] - expected two or more teams for MERGE - got:"
                pp last_teams
                exit 1
              end
              ## auto-add country to all teams
              teams = last_teams.map { |team| [team, last_country.key] }
              recs << [ last_keyword, last_season.key,
                        teams,
                        [ squish( line.sub( '⇒', '' ) ), last_country.key ]
                      ]

              last_teams = []
            else
              last_teams << squish( line )
            end
          else
            puts "!!! ERROR [club history reader] - unknown keyword >#{last_keyword}<; cannot process; sorry"
            exit 1
          end
        end  # each line (in paragraph)
      else
        puts "** !!! ERROR [club history reader] - unknown line type:"
        pp node
        exit 1
      end
    end

    recs
  end


  ###############
  ## helper

  ## collapse runs of spaces, tabs and non-breaking spaces to one space
  ##   and remove leading and trailing whitespace
  ##   (same rules as ClubReader#_squish to get matching team names)
  def squish( str )
    str.gsub( /[ \t\u00a0]+/, ' ' ).strip
  end

end # class ClubHistoryReader


end ## module Import
end ## module SportDb
@@ -1,90 +0,0 @@
1
- # encoding: utf-8
2
-
3
-
4
module SportDb
module Import


##
# Reads club properties (Key, Code) from a text in csv format and
# updates the matching club records found via catalog.clubs.match.
#
# The rows come from parse_csv and are accessed by column name;
# the Name column is required.
#
# Note: all errors are reported with puts and end the process with exit 1.
class ClubPropsReader

  def catalog() Import.catalog; end


  ## read the file in path (as utf-8) and parse it
  def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
    txt = File.open( path, 'r:utf-8' ) { |f| f.read }
    parse( txt )
  end

  ## parse the text in txt
  def self.parse( txt )
    new( txt ).parse
  end


  def initialize( txt )
    @txt = txt
  end

  ##
  # Update the key and code of the club named in every row.
  #
  # Exits with an error if a row has no name, if no club matches the name
  # or if more than one club still matches after filtering by the
  # canonical name. Returns the rows (as returned by parse_csv).
  def parse
    recs = parse_csv( @txt )
    recs.each do |rec|
      name = rec['Name']
      if name.nil?
        puts "** !!! ERROR !!! Name column required / missing / NOT found in row:"
        pp rec
        exit 1
      end

      ## find / match club by (canonical) name
      ##   note: a nil result is handled like an empty list
      ##         (NOTE(review): confirm if match returns nil or [] for no match)
      m = catalog.clubs.match( name ) || []
      if m.size > 1
        puts "** !!! WARN !!! ambigious (multiple) club matches (#{m.size}) for name >#{name}< in props row:"
        pp rec
        pp m

        ## try filter by canonical name if more than one match
        m = m.select { |club| club.name == name }
      end

      if m.empty?
        puts "** !!! ERROR !!! no club match for (canonical) name >#{name}< in props row:"
        pp rec
        exit 1
      elsif m.size > 1
        puts "** !!! ERROR !!! ambigious (multiple) club matches (#{m.size}) for (canonical) name >#{name}< in props row:"
        pp rec
        pp m
        exit 1
      else    ## assume size == 1, bingo!!!
        club_rec = m[0]
        ## todo/fix: warn if name differs from (canonical) name
        ## todo/fix: also add props to in-memory structs/records!!!
        ## todo/fix: only updated "on-demand" from in-memory struct/records!!!!

        ## update attributes - keep the current value if the column is n/a
        club_rec.key  = rec['Key']    if is_not_na?( rec['Key'] )
        club_rec.code = rec['Code']   if is_not_na?( rec['Code'] )
        ## todo/fix: add (some) more props e.g. address, web, etc.
      end
    end
  end


  ## allow various values for nil or n/a (not available/applicable) for now
  ##   add more or less - why? why not?
  NA_VARIANTS = ['-', '--', '---',
                 '?', '??', '???',
                 '_', '__', '___',
                 'na', 'n/a',
                 'nil', 'null'].freeze

  def is_not_na?( col ) !is_na?( col ); end   ## check: find a better name - why? why not?

  ## true if col is nil, blank or one of the n/a variants
  ##   (ignoring case and leading / trailing whitespace)
  def is_na?( col )
    return true  if col.nil?

    value = col.strip
    value.empty? || NA_VARIANTS.include?( value.downcase )
  end

end # class ClubPropsReader

end ## module Import
end ## module SportDb