sportdb-formats 1.1.5 → 1.1.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,203 +1,203 @@
1
- # encoding: utf-8
2
-
3
-
4
- module SportDb
5
- module Import
6
-
7
-
8
- class ClubHistoryReader
9
-
10
- def catalog() Import.catalog; end
11
-
12
-
13
-
14
- def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
15
- txt = File.open( path, 'r:utf-8' ) { |f| f.read }
16
- parse( txt )
17
- end
18
-
19
- def self.parse( txt )
20
- new( txt ).parse
21
- end
22
-
23
- def initialize( txt )
24
- @txt = txt
25
- end
26
-
27
-
28
- ###
29
- ## RENAME/RENAMED
30
- ## MOVE/MOVED
31
- ## BANKRUPT/BANKRUPTED
32
- ## REFORM/REFORMED
33
- ## MERGE/MERGED - allow + or ++ or +++ or ; for "inline" - why? why not?
34
-
35
-
36
- KEYWORD_LINE_RE = %r{ ^(?<keyword>RENAMED?|
37
- MOVED?|
38
- BANKRUPT(?:ED)?|
39
- REFORM(?:ED)?|
40
- MERGED?
41
- )
42
- [ ]+
43
- (?<text>.*) # rest of text
44
- $
45
- }x
46
-
47
-
48
- def parse
49
- recs = []
50
- last_rec = nil
51
-
52
- last_country = nil
53
- last_season = nil
54
- last_keyword = nil
55
- last_teams = []
56
-
57
- OutlineReader.parse( @txt ).each do |node|
58
- if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
59
- heading_level = node[0][1].to_i
60
- heading = node[1]
61
-
62
- puts "heading #{heading_level} >#{heading}<"
63
-
64
-
65
- if heading_level == 1
66
- ## assume country in heading; allow all "formats" supported by parse e.g.
67
- ## Österreich • Austria (at)
68
- ## Österreich • Austria
69
- ## Austria
70
- ## Deutschland (de) • Germany
71
- country = catalog.countries.parse( heading )
72
- ## check country code - MUST exist for now!!!!
73
- if country.nil?
74
- puts "!!! error [club history reader] - unknown country >#{heading}< - sorry - add country to config to fix"
75
- exit 1
76
- end
77
- puts " country >#{heading}< => #{country.name}, #{country.key}"
78
- last_country = country
79
- last_season = nil ## reset "lower levels" - season & keyword
80
- last_keyword = nil
81
- elsif heading_level == 2
82
- ## assume season
83
- season = Season.parse( heading )
84
- puts " season >#{heading}< => #{season.key}"
85
- last_season = season ## reset "lowwer levels" - keyword
86
- last_keyword = nil
87
- else
88
- puts "!!! ERROR [club history reader] - for now only heading 1 & 2 supported; sorry"
89
- exit 1
90
- end
91
-
92
- elsif node[0] == :p ## paragraph with (text) lines
93
- if last_country.nil?
94
- puts "!!! ERROR [club history reader] - country heading 1 required, sorry"
95
- exit 1
96
- end
97
- if last_season.nil?
98
- puts "!!! ERROR [club history reader] - season heading 2 required, sorry"
99
- exit 1
100
- end
101
-
102
- lines = node[1]
103
- lines.each do |line|
104
- if m=line.match(KEYWORD_LINE_RE) ## extract keyword and continue
105
- keyword = m[:keyword]
106
- line = m[:text].strip
107
-
108
- puts " keyword #{keyword}"
109
- last_keyword = case keyword ## "normalize" keywords
110
- when 'BANKRUPT', 'BANKRUPTED'
111
- 'BANKRUPT'
112
- when 'RENAME', 'RENAMED'
113
- 'RENAME'
114
- when 'REFORM', 'REFORMED'
115
- 'REFORM'
116
- when 'MOVE', 'MOVED'
117
- 'MOVE'
118
- when 'MERGE', 'MERGED'
119
- 'MERGE'
120
- else
121
- puts "!!! ERROR [club history reader] - unexpected keyword >#{keyword}<; sorry - don't know how to normalize"
122
- exit 1
123
- end
124
-
125
- last_teams = []
126
- end
127
-
128
- if last_keyword.nil?
129
- puts "!!! ERROR [club history reader] - line with keyword expected - got:"
130
- puts line
131
- exit 1
132
- end
133
-
134
- if last_keyword == 'BANKRUPT'
135
- ## requires / expects one team in one line
136
- recs << [ last_keyword, last_season.key,
137
- [ squish(line), last_country.key ]
138
- ]
139
- elsif last_keyword == 'RENAME' ||
140
- last_keyword == 'REFORM' ||
141
- last_keyword == 'MOVE'
142
- ## requires / expects two teams in one line (separated by ⇒ or such)
143
- teams = line.split( '⇒' )
144
- if teams.size != 2
145
- puts "!!! ERROR [club history reader] - expected two teams - got:"
146
- pp teams
147
- exit 1
148
- end
149
- teams = teams.map {|team| squish(team.strip) } ## remove whitespaces
150
- recs << [ last_keyword, last_season.key,
151
- [ teams[0], last_country.key ],
152
- [ teams[1], last_country.key ]
153
- ]
154
- elsif last_keyword == 'MERGE'
155
- ## check if line starts with separator
156
- ## otherwise collect to be merged teams
157
- if line.start_with?( '⇒' )
158
- if last_teams.size < 2
159
- puts "!!! ERROR [club history reader] - expected two or more teams for MERGE - got:"
160
- pp last_teams
161
- exit 1
162
- end
163
- ## auto-add country to all teams
164
- teams = last_teams.map {|team| [team, last_country.key]}
165
- recs << [ last_keyword, last_season.key,
166
- teams,
167
- [ squish(line.sub('⇒','').strip), last_country.key ]
168
- ]
169
-
170
- last_teams = []
171
- else
172
- last_teams << squish(line)
173
- end
174
- else
175
- puts "!!! ERROR [club history reader] - unknown keyword >#{last_keyword}<; cannot process; sorry"
176
- exit 1
177
- end
178
- end # each line (in paragraph)
179
- else
180
- puts "** !!! ERROR [club history reader] - unknown line type:"
181
- pp node
182
- exit 1
183
- end
184
- end
185
-
186
- recs
187
- end # method read
188
-
189
-
190
- ###############
191
- ## helper
192
-
193
- def squish( str )
194
- ## colapse all whitespace to one
195
- str.gsub( /[ ]+/,' ' )
196
- end
197
-
198
-
199
- end # class ClubHistoryReader
200
-
201
-
202
- end ## module Import
203
- end ## module SportDb
1
+ # encoding: utf-8
2
+
3
+
4
+ module SportDb
5
+ module Import
6
+
7
+
8
+ class ClubHistoryReader
9
+
10
+ def catalog() Import.catalog; end
11
+
12
+
13
+
14
+ def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
15
+ txt = File.open( path, 'r:utf-8' ) { |f| f.read }
16
+ parse( txt )
17
+ end
18
+
19
+ def self.parse( txt )
20
+ new( txt ).parse
21
+ end
22
+
23
+ def initialize( txt )
24
+ @txt = txt
25
+ end
26
+
27
+
28
+ ###
29
+ ## RENAME/RENAMED
30
+ ## MOVE/MOVED
31
+ ## BANKRUPT/BANKRUPTED
32
+ ## REFORM/REFORMED
33
+ ## MERGE/MERGED - allow + or ++ or +++ or ; for "inline" - why? why not?
34
+
35
+
36
+ KEYWORD_LINE_RE = %r{ ^(?<keyword>RENAMED?|
37
+ MOVED?|
38
+ BANKRUPT(?:ED)?|
39
+ REFORM(?:ED)?|
40
+ MERGED?
41
+ )
42
+ [ ]+
43
+ (?<text>.*) # rest of text
44
+ $
45
+ }x
46
+
47
+
48
+ def parse
49
+ recs = []
50
+ last_rec = nil
51
+
52
+ last_country = nil
53
+ last_season = nil
54
+ last_keyword = nil
55
+ last_teams = []
56
+
57
+ OutlineReader.parse( @txt ).each do |node|
58
+ if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
59
+ heading_level = node[0][1].to_i
60
+ heading = node[1]
61
+
62
+ puts "heading #{heading_level} >#{heading}<"
63
+
64
+
65
+ if heading_level == 1
66
+ ## assume country in heading; allow all "formats" supported by parse e.g.
67
+ ## Österreich • Austria (at)
68
+ ## Österreich • Austria
69
+ ## Austria
70
+ ## Deutschland (de) • Germany
71
+ country = catalog.countries.parse( heading )
72
+ ## check country code - MUST exist for now!!!!
73
+ if country.nil?
74
+ puts "!!! error [club history reader] - unknown country >#{heading}< - sorry - add country to config to fix"
75
+ exit 1
76
+ end
77
+ puts " country >#{heading}< => #{country.name}, #{country.key}"
78
+ last_country = country
79
+ last_season = nil ## reset "lower levels" - season & keyword
80
+ last_keyword = nil
81
+ elsif heading_level == 2
82
+ ## assume season
83
+ season = Season.parse( heading )
84
+ puts " season >#{heading}< => #{season.key}"
85
+ last_season = season ## reset "lowwer levels" - keyword
86
+ last_keyword = nil
87
+ else
88
+ puts "!!! ERROR [club history reader] - for now only heading 1 & 2 supported; sorry"
89
+ exit 1
90
+ end
91
+
92
+ elsif node[0] == :p ## paragraph with (text) lines
93
+ if last_country.nil?
94
+ puts "!!! ERROR [club history reader] - country heading 1 required, sorry"
95
+ exit 1
96
+ end
97
+ if last_season.nil?
98
+ puts "!!! ERROR [club history reader] - season heading 2 required, sorry"
99
+ exit 1
100
+ end
101
+
102
+ lines = node[1]
103
+ lines.each do |line|
104
+ if m=line.match(KEYWORD_LINE_RE) ## extract keyword and continue
105
+ keyword = m[:keyword]
106
+ line = m[:text].strip
107
+
108
+ puts " keyword #{keyword}"
109
+ last_keyword = case keyword ## "normalize" keywords
110
+ when 'BANKRUPT', 'BANKRUPTED'
111
+ 'BANKRUPT'
112
+ when 'RENAME', 'RENAMED'
113
+ 'RENAME'
114
+ when 'REFORM', 'REFORMED'
115
+ 'REFORM'
116
+ when 'MOVE', 'MOVED'
117
+ 'MOVE'
118
+ when 'MERGE', 'MERGED'
119
+ 'MERGE'
120
+ else
121
+ puts "!!! ERROR [club history reader] - unexpected keyword >#{keyword}<; sorry - don't know how to normalize"
122
+ exit 1
123
+ end
124
+
125
+ last_teams = []
126
+ end
127
+
128
+ if last_keyword.nil?
129
+ puts "!!! ERROR [club history reader] - line with keyword expected - got:"
130
+ puts line
131
+ exit 1
132
+ end
133
+
134
+ if last_keyword == 'BANKRUPT'
135
+ ## requires / expects one team in one line
136
+ recs << [ last_keyword, last_season.key,
137
+ [ squish(line), last_country.key ]
138
+ ]
139
+ elsif last_keyword == 'RENAME' ||
140
+ last_keyword == 'REFORM' ||
141
+ last_keyword == 'MOVE'
142
+ ## requires / expects two teams in one line (separated by ⇒ or such)
143
+ teams = line.split( '⇒' )
144
+ if teams.size != 2
145
+ puts "!!! ERROR [club history reader] - expected two teams - got:"
146
+ pp teams
147
+ exit 1
148
+ end
149
+ teams = teams.map {|team| squish(team.strip) } ## remove whitespaces
150
+ recs << [ last_keyword, last_season.key,
151
+ [ teams[0], last_country.key ],
152
+ [ teams[1], last_country.key ]
153
+ ]
154
+ elsif last_keyword == 'MERGE'
155
+ ## check if line starts with separator
156
+ ## otherwise collect to be merged teams
157
+ if line.start_with?( '⇒' )
158
+ if last_teams.size < 2
159
+ puts "!!! ERROR [club history reader] - expected two or more teams for MERGE - got:"
160
+ pp last_teams
161
+ exit 1
162
+ end
163
+ ## auto-add country to all teams
164
+ teams = last_teams.map {|team| [team, last_country.key]}
165
+ recs << [ last_keyword, last_season.key,
166
+ teams,
167
+ [ squish(line.sub('⇒','').strip), last_country.key ]
168
+ ]
169
+
170
+ last_teams = []
171
+ else
172
+ last_teams << squish(line)
173
+ end
174
+ else
175
+ puts "!!! ERROR [club history reader] - unknown keyword >#{last_keyword}<; cannot process; sorry"
176
+ exit 1
177
+ end
178
+ end # each line (in paragraph)
179
+ else
180
+ puts "** !!! ERROR [club history reader] - unknown line type:"
181
+ pp node
182
+ exit 1
183
+ end
184
+ end
185
+
186
+ recs
187
+ end # method read
188
+
189
+
190
+ ###############
191
+ ## helper
192
+
193
+ def squish( str )
194
+ ## colapse all whitespace to one
195
+ str.gsub( /[ ]+/,' ' )
196
+ end
197
+
198
+
199
+ end # class ClubHistoryReader
200
+
201
+
202
+ end ## module Import
203
+ end ## module SportDb
@@ -1,108 +1,108 @@
1
- # encoding: utf-8
2
-
3
-
4
- module SportDb
5
- module Import
6
-
7
-
8
- class WikiReader ## todo/check: rename to WikiClubReader - why? why not?
9
-
10
- class WikiClub # nested class
11
- attr_reader :name, :country
12
- def initialize( name, country )
13
- @name, @country = name, country
14
- end
15
- end # (nested) class WikiClub
16
-
17
-
18
- def catalog() Import.catalog; end
19
-
20
-
21
- def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
22
- txt = File.open( path, 'r:utf-8' ) { |f| f.read }
23
- parse( txt )
24
- end
25
-
26
- def self.parse( txt )
27
- new( txt ).parse
28
- end
29
-
30
- def initialize( txt )
31
- @txt = txt
32
- end
33
-
34
- def parse
35
- recs = []
36
- last_country = nil ## note: supports only one level of headings for now (and that is a country)
37
-
38
- @txt.each_line do |line|
39
- line = line.strip
40
-
41
- next if line.empty?
42
- next if line.start_with?( '#' ) ## skip comments too
43
-
44
- ## strip inline (until end-of-line) comments too
45
- ## e.g Eupen => KAS Eupen, ## [de]
46
- ## => Eupen => KAS Eupen,
47
- line = line.sub( /#.*/, '' ).strip
48
- pp line
49
-
50
-
51
- next if line =~ /^={1,}$/ ## skip "decorative" only heading e.g. ========
52
-
53
- ## note: like in wikimedia markup (and markdown) all optional trailing ==== too
54
- ## todo/check: allow === Text =-=-=-=-=-= too - why? why not?
55
- if line =~ /^(={1,}) ## leading ======
56
- ([^=]+?) ## text (note: for now no "inline" = allowed)
57
- =* ## (optional) trailing ====
58
- $/x
59
- heading_marker = $1
60
- heading_level = $1.length ## count number of = for heading level
61
- heading = $2.strip
62
-
63
- puts "heading #{heading_level} >#{heading}<"
64
-
65
- if heading_level > 1
66
- puts "** !!! ERROR [wiki reader] !!! - - headings level too deep - only top / one level supported for now; sorry"
67
- exit 1
68
- end
69
-
70
- ## assume country in heading; allow all "formats" supported by parse e.g.
71
- ## Österreich • Austria (at)
72
- ## Österreich • Austria
73
- ## Austria
74
- ## Deutschland (de) • Germany
75
- country = catalog.countries.parse( heading )
76
- ## check country code - MUST exist for now!!!!
77
- if country.nil?
78
- puts "!!! error [wiki reader] - unknown country >#{heading}< - sorry - add country to config to fix"
79
- exit 1
80
- end
81
-
82
- last_country = country
83
- pp last_country
84
- else
85
- ## strip and squish (white)spaces
86
- # e.g. New York FC (2011-) => New York FC (2011-)
87
- value = line.strip.gsub( /[ \t]+/, ' ' )
88
-
89
- ## normalize (allow underscore (-) - replace with space)
90
- ## e.g. Cercle_Brugge_K.S.V. => Cercle Brugge K.S.V.
91
- value = value.gsub( '_', ' ' )
92
-
93
- if last_country.nil?
94
- puts "** !!! ERROR [wiki reader] !!! - country heading missing for club name; sorry - add country heading to fix"
95
- exit 1
96
- end
97
-
98
- rec = WikiClub.new( value, last_country )
99
- recs << rec
100
- end
101
- end # each_line
102
- recs
103
- end # method read
104
-
105
- end # class WikiReader
106
-
107
- end ## module Import
108
- end ## module SportDb
1
+ # encoding: utf-8
2
+
3
+
4
+ module SportDb
5
+ module Import
6
+
7
+
8
+ class WikiReader ## todo/check: rename to WikiClubReader - why? why not?
9
+
10
+ class WikiClub # nested class
11
+ attr_reader :name, :country
12
+ def initialize( name, country )
13
+ @name, @country = name, country
14
+ end
15
+ end # (nested) class WikiClub
16
+
17
+
18
+ def catalog() Import.catalog; end
19
+
20
+
21
+ def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
22
+ txt = File.open( path, 'r:utf-8' ) { |f| f.read }
23
+ parse( txt )
24
+ end
25
+
26
+ def self.parse( txt )
27
+ new( txt ).parse
28
+ end
29
+
30
+ def initialize( txt )
31
+ @txt = txt
32
+ end
33
+
34
+ def parse
35
+ recs = []
36
+ last_country = nil ## note: supports only one level of headings for now (and that is a country)
37
+
38
+ @txt.each_line do |line|
39
+ line = line.strip
40
+
41
+ next if line.empty?
42
+ next if line.start_with?( '#' ) ## skip comments too
43
+
44
+ ## strip inline (until end-of-line) comments too
45
+ ## e.g Eupen => KAS Eupen, ## [de]
46
+ ## => Eupen => KAS Eupen,
47
+ line = line.sub( /#.*/, '' ).strip
48
+ pp line
49
+
50
+
51
+ next if line =~ /^={1,}$/ ## skip "decorative" only heading e.g. ========
52
+
53
+ ## note: like in wikimedia markup (and markdown) all optional trailing ==== too
54
+ ## todo/check: allow === Text =-=-=-=-=-= too - why? why not?
55
+ if line =~ /^(={1,}) ## leading ======
56
+ ([^=]+?) ## text (note: for now no "inline" = allowed)
57
+ =* ## (optional) trailing ====
58
+ $/x
59
+ heading_marker = $1
60
+ heading_level = $1.length ## count number of = for heading level
61
+ heading = $2.strip
62
+
63
+ puts "heading #{heading_level} >#{heading}<"
64
+
65
+ if heading_level > 1
66
+ puts "** !!! ERROR [wiki reader] !!! - - headings level too deep - only top / one level supported for now; sorry"
67
+ exit 1
68
+ end
69
+
70
+ ## assume country in heading; allow all "formats" supported by parse e.g.
71
+ ## Österreich • Austria (at)
72
+ ## Österreich • Austria
73
+ ## Austria
74
+ ## Deutschland (de) • Germany
75
+ country = catalog.countries.parse( heading )
76
+ ## check country code - MUST exist for now!!!!
77
+ if country.nil?
78
+ puts "!!! error [wiki reader] - unknown country >#{heading}< - sorry - add country to config to fix"
79
+ exit 1
80
+ end
81
+
82
+ last_country = country
83
+ pp last_country
84
+ else
85
+ ## strip and squish (white)spaces
86
+ # e.g. New York FC (2011-) => New York FC (2011-)
87
+ value = line.strip.gsub( /[ \t]+/, ' ' )
88
+
89
+ ## normalize (allow underscore (-) - replace with space)
90
+ ## e.g. Cercle_Brugge_K.S.V. => Cercle Brugge K.S.V.
91
+ value = value.gsub( '_', ' ' )
92
+
93
+ if last_country.nil?
94
+ puts "** !!! ERROR [wiki reader] !!! - country heading missing for club name; sorry - add country heading to fix"
95
+ exit 1
96
+ end
97
+
98
+ rec = WikiClub.new( value, last_country )
99
+ recs << rec
100
+ end
101
+ end # each_line
102
+ recs
103
+ end # method read
104
+
105
+ end # class WikiReader
106
+
107
+ end ## module Import
108
+ end ## module SportDb