sportdb-formats 1.1.5 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/sportdb/formats/country/country_reader.rb +142 -142
- data/lib/sportdb/formats/datafile.rb +59 -59
- data/lib/sportdb/formats/event/event_index.rb +141 -141
- data/lib/sportdb/formats/event/event_reader.rb +183 -183
- data/lib/sportdb/formats/league/league_outline_reader.rb +1 -0
- data/lib/sportdb/formats/league/league_reader.rb +168 -168
- data/lib/sportdb/formats/match/match_parser_auto_conf.rb +202 -202
- data/lib/sportdb/formats/package.rb +374 -374
- data/lib/sportdb/formats/team/club_index_history.rb +134 -134
- data/lib/sportdb/formats/team/club_reader.rb +350 -350
- data/lib/sportdb/formats/team/club_reader_history.rb +203 -203
- data/lib/sportdb/formats/team/wiki_reader.rb +108 -108
- data/lib/sportdb/formats/version.rb +1 -1
- data/test/test_club_index_history.rb +107 -107
- data/test/test_club_reader.rb +201 -201
- data/test/test_club_reader_history.rb +212 -212
- data/test/test_country_reader.rb +89 -89
- data/test/test_league_outline_reader.rb +55 -55
- data/test/test_league_reader.rb +72 -72
- data/test/test_outline_reader.rb +31 -31
- data/test/test_regex.rb +67 -67
- data/test/test_wiki_reader.rb +77 -77
- metadata +12 -6
@@ -1,134 +1,134 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module SportDb
|
4
|
-
module Import
|
5
|
-
|
6
|
-
|
7
|
-
class ClubHistoryIndex
|
8
|
-
|
9
|
-
def self.build( path )
|
10
|
-
pack = Package.new( path ) ## lets us use direcotry or zip archive
|
11
|
-
|
12
|
-
recs = []
|
13
|
-
pack.each_clubs_history do |entry|
|
14
|
-
recs += ClubHistoryReader.parse( entry.read )
|
15
|
-
end
|
16
|
-
recs
|
17
|
-
|
18
|
-
index = new
|
19
|
-
index.add( recs )
|
20
|
-
index
|
21
|
-
end
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
def catalog() Import.catalog; end
|
26
|
-
|
27
|
-
## note: keep name history for now separate from
|
28
|
-
## from club struct - why? why not?
|
29
|
-
## later yes, yes, yes, merge name history into club struct!!!!!
|
30
|
-
##
|
31
|
-
## for now the name history is experimental
|
32
|
-
|
33
|
-
|
34
|
-
def initialize
|
35
|
-
@clubs = {} ## clubs (indexed) by canonical name
|
36
|
-
@errors = []
|
37
|
-
end
|
38
|
-
|
39
|
-
attr_reader :errors
|
40
|
-
def errors?() @errors.empty? == false; end
|
41
|
-
|
42
|
-
def mappings() @clubs; end ## todo/check: rename to records or histories or something - why? why not?
|
43
|
-
|
44
|
-
|
45
|
-
def add_history( club_rec, keyword, season, args )
|
46
|
-
## note use season obj for now (and NOT key) - why? why not?
|
47
|
-
rec = @clubs[ club_rec.name ] ||= []
|
48
|
-
|
49
|
-
rec << [season, [keyword, args]]
|
50
|
-
|
51
|
-
## note: always keep records sorted by season_key for now
|
52
|
-
## check if 2010 and 2010/11 is in order using alpha sort?? (see argentina)
|
53
|
-
rec.sort! { |l,r| r[0] <=> l[0] }
|
54
|
-
end
|
55
|
-
|
56
|
-
|
57
|
-
def add( rec_or_recs ) ## add club record / alt_names
|
58
|
-
recs = rec_or_recs.is_a?( Array ) ? rec_or_recs : [rec_or_recs] ## wrap (single) rec in array
|
59
|
-
|
60
|
-
recs.each do |rec|
|
61
|
-
|
62
|
-
keyword = rec[0]
|
63
|
-
season_key = rec[1]
|
64
|
-
args = rec[2..-1] ## get rest of args e.g. one, two or more
|
65
|
-
|
66
|
-
## note: for now only add (re)name history season records,
|
67
|
-
## that is, skip MERGE and BANKRUPT for now
|
68
|
-
## and incl. only RENAME, REFORM, MOVE for now
|
69
|
-
next if ['MERGE', 'BANKRUPT'].include?( keyword )
|
70
|
-
|
71
|
-
|
72
|
-
name_old = strip_geo( args[0][0] ) ## note: strip optional geo part from name
|
73
|
-
name_new = strip_geo( args[1][0] )
|
74
|
-
|
75
|
-
country_old = args[0][1]
|
76
|
-
country_new = args[1][1]
|
77
|
-
|
78
|
-
club_old = catalog.clubs.find_by!( name: name_old, country: country_old )
|
79
|
-
club_new = catalog.clubs.find_by!( name: name_new, country: country_new )
|
80
|
-
|
81
|
-
## note use season obj for now (and NOT key) - why? why not?
|
82
|
-
season = Season.parse( season_key )
|
83
|
-
|
84
|
-
## todo/check:
|
85
|
-
## check if club_old and club_new reference different club record!!
|
86
|
-
## examples - RB II -> Liefering ?? or
|
87
|
-
## FC Pasching -> OOE Juniors ??
|
88
|
-
## Austria Salzburg -> RB Salburg ??
|
89
|
-
## for now always add name history to both - why? why not?
|
90
|
-
|
91
|
-
add_history( club_old, keyword, season, args )
|
92
|
-
## note: allow for now different club references
|
93
|
-
## but maybe warn later - why? why not?
|
94
|
-
## add history to both for now
|
95
|
-
add_history( club_new, keyword, season, args ) if club_old != club_new
|
96
|
-
end # each rec
|
97
|
-
end # method add
|
98
|
-
|
99
|
-
|
100
|
-
#### todo/check: move as method to club struct later - to always use club reference
|
101
|
-
## returns (simply) name as string for now or nil - why? why not?
|
102
|
-
#
|
103
|
-
# history entry example
|
104
|
-
# Arsenal FC"=>
|
105
|
-
# [[1927/28, ["RENAME", [["The Arsenal FC, London", "eng"], ["Arsenal FC", "eng"]]]],
|
106
|
-
# [1914/15, ["RENAME", [["Woolwich Arsenal FC, London", "eng"], ["The Arsenal FC", "eng"]]]],
|
107
|
-
# [1892/93, ["RENAME", [["Royal Arsenal FC, London", "eng"], ["Woolwich Arsenal FC", "eng"]]]]],
|
108
|
-
def find_name_by( name:, season: )
|
109
|
-
recs = @clubs[ name ]
|
110
|
-
if recs
|
111
|
-
season = Season( season ) ## make sure season is a season obj (and NOT a string)
|
112
|
-
## check season records for name; use linear search (assume only few records)
|
113
|
-
recs.each do |rec|
|
114
|
-
if season >= rec[0]
|
115
|
-
return strip_geo( rec[1][1][1][0] ) # use second arg
|
116
|
-
end
|
117
|
-
end
|
118
|
-
## if we get here use last name
|
119
|
-
strip_geo( recs[-1][1][1][0][0] ) # use first arg
|
120
|
-
else
|
121
|
-
nil
|
122
|
-
end
|
123
|
-
end
|
124
|
-
|
125
|
-
##################
|
126
|
-
## helpers
|
127
|
-
def strip_geo( name )
|
128
|
-
## e.g. Arsenal, London => Arsenal
|
129
|
-
name.split(',')[0].strip
|
130
|
-
end
|
131
|
-
end # class ClubHistoryIndex
|
132
|
-
|
133
|
-
end # module Import
|
134
|
-
end # module SportDb
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module SportDb
|
4
|
+
module Import
|
5
|
+
|
6
|
+
|
7
|
+
class ClubHistoryIndex
|
8
|
+
|
9
|
+
def self.build( path )
|
10
|
+
pack = Package.new( path ) ## lets us use direcotry or zip archive
|
11
|
+
|
12
|
+
recs = []
|
13
|
+
pack.each_clubs_history do |entry|
|
14
|
+
recs += ClubHistoryReader.parse( entry.read )
|
15
|
+
end
|
16
|
+
recs
|
17
|
+
|
18
|
+
index = new
|
19
|
+
index.add( recs )
|
20
|
+
index
|
21
|
+
end
|
22
|
+
|
23
|
+
|
24
|
+
|
25
|
+
def catalog() Import.catalog; end
|
26
|
+
|
27
|
+
## note: keep name history for now separate from
|
28
|
+
## from club struct - why? why not?
|
29
|
+
## later yes, yes, yes, merge name history into club struct!!!!!
|
30
|
+
##
|
31
|
+
## for now the name history is experimental
|
32
|
+
|
33
|
+
|
34
|
+
def initialize
|
35
|
+
@clubs = {} ## clubs (indexed) by canonical name
|
36
|
+
@errors = []
|
37
|
+
end
|
38
|
+
|
39
|
+
attr_reader :errors
|
40
|
+
def errors?() @errors.empty? == false; end
|
41
|
+
|
42
|
+
def mappings() @clubs; end ## todo/check: rename to records or histories or something - why? why not?
|
43
|
+
|
44
|
+
|
45
|
+
def add_history( club_rec, keyword, season, args )
|
46
|
+
## note use season obj for now (and NOT key) - why? why not?
|
47
|
+
rec = @clubs[ club_rec.name ] ||= []
|
48
|
+
|
49
|
+
rec << [season, [keyword, args]]
|
50
|
+
|
51
|
+
## note: always keep records sorted by season_key for now
|
52
|
+
## check if 2010 and 2010/11 is in order using alpha sort?? (see argentina)
|
53
|
+
rec.sort! { |l,r| r[0] <=> l[0] }
|
54
|
+
end
|
55
|
+
|
56
|
+
|
57
|
+
def add( rec_or_recs ) ## add club record / alt_names
|
58
|
+
recs = rec_or_recs.is_a?( Array ) ? rec_or_recs : [rec_or_recs] ## wrap (single) rec in array
|
59
|
+
|
60
|
+
recs.each do |rec|
|
61
|
+
|
62
|
+
keyword = rec[0]
|
63
|
+
season_key = rec[1]
|
64
|
+
args = rec[2..-1] ## get rest of args e.g. one, two or more
|
65
|
+
|
66
|
+
## note: for now only add (re)name history season records,
|
67
|
+
## that is, skip MERGE and BANKRUPT for now
|
68
|
+
## and incl. only RENAME, REFORM, MOVE for now
|
69
|
+
next if ['MERGE', 'BANKRUPT'].include?( keyword )
|
70
|
+
|
71
|
+
|
72
|
+
name_old = strip_geo( args[0][0] ) ## note: strip optional geo part from name
|
73
|
+
name_new = strip_geo( args[1][0] )
|
74
|
+
|
75
|
+
country_old = args[0][1]
|
76
|
+
country_new = args[1][1]
|
77
|
+
|
78
|
+
club_old = catalog.clubs.find_by!( name: name_old, country: country_old )
|
79
|
+
club_new = catalog.clubs.find_by!( name: name_new, country: country_new )
|
80
|
+
|
81
|
+
## note use season obj for now (and NOT key) - why? why not?
|
82
|
+
season = Season.parse( season_key )
|
83
|
+
|
84
|
+
## todo/check:
|
85
|
+
## check if club_old and club_new reference different club record!!
|
86
|
+
## examples - RB II -> Liefering ?? or
|
87
|
+
## FC Pasching -> OOE Juniors ??
|
88
|
+
## Austria Salzburg -> RB Salburg ??
|
89
|
+
## for now always add name history to both - why? why not?
|
90
|
+
|
91
|
+
add_history( club_old, keyword, season, args )
|
92
|
+
## note: allow for now different club references
|
93
|
+
## but maybe warn later - why? why not?
|
94
|
+
## add history to both for now
|
95
|
+
add_history( club_new, keyword, season, args ) if club_old != club_new
|
96
|
+
end # each rec
|
97
|
+
end # method add
|
98
|
+
|
99
|
+
|
100
|
+
#### todo/check: move as method to club struct later - to always use club reference
|
101
|
+
## returns (simply) name as string for now or nil - why? why not?
|
102
|
+
#
|
103
|
+
# history entry example
|
104
|
+
# Arsenal FC"=>
|
105
|
+
# [[1927/28, ["RENAME", [["The Arsenal FC, London", "eng"], ["Arsenal FC", "eng"]]]],
|
106
|
+
# [1914/15, ["RENAME", [["Woolwich Arsenal FC, London", "eng"], ["The Arsenal FC", "eng"]]]],
|
107
|
+
# [1892/93, ["RENAME", [["Royal Arsenal FC, London", "eng"], ["Woolwich Arsenal FC", "eng"]]]]],
|
108
|
+
def find_name_by( name:, season: )
|
109
|
+
recs = @clubs[ name ]
|
110
|
+
if recs
|
111
|
+
season = Season( season ) ## make sure season is a season obj (and NOT a string)
|
112
|
+
## check season records for name; use linear search (assume only few records)
|
113
|
+
recs.each do |rec|
|
114
|
+
if season >= rec[0]
|
115
|
+
return strip_geo( rec[1][1][1][0] ) # use second arg
|
116
|
+
end
|
117
|
+
end
|
118
|
+
## if we get here use last name
|
119
|
+
strip_geo( recs[-1][1][1][0][0] ) # use first arg
|
120
|
+
else
|
121
|
+
nil
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
##################
|
126
|
+
## helpers
|
127
|
+
def strip_geo( name )
|
128
|
+
## e.g. Arsenal, London => Arsenal
|
129
|
+
name.split(',')[0].strip
|
130
|
+
end
|
131
|
+
end # class ClubHistoryIndex
|
132
|
+
|
133
|
+
end # module Import
|
134
|
+
end # module SportDb
|
@@ -1,350 +1,350 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
|
4
|
-
module SportDb
|
5
|
-
module Import
|
6
|
-
|
7
|
-
|
8
|
-
class ClubReader
|
9
|
-
|
10
|
-
def catalog() Import.catalog; end
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
|
15
|
-
txt = File.open( path, 'r:utf-8' ) { |f| f.read }
|
16
|
-
parse( txt )
|
17
|
-
end
|
18
|
-
|
19
|
-
def self.parse( txt )
|
20
|
-
new( txt ).parse
|
21
|
-
end
|
22
|
-
|
23
|
-
def initialize( txt )
|
24
|
-
@txt = txt
|
25
|
-
end
|
26
|
-
|
27
|
-
## pattern for b (child) team / club marker e.g.
|
28
|
-
## (ii) or ii) or ii.) or (ii.) or (II)
|
29
|
-
## (b) or b) or b.) or (b.) or (B)
|
30
|
-
## (2) or 2) or 2.) or (2.)
|
31
|
-
B_TEAM_MARKER_RE = %r{^ \(? # optional opening bracket
|
32
|
-
(?: ii|b|2 )
|
33
|
-
\.? # optional dot - keep and allow dot - why? why not?
|
34
|
-
\) # required closing bracket
|
35
|
-
}xi ## note: add case-insenstive (e.g. II/ii or B/b)
|
36
|
-
|
37
|
-
## pattern for checking for address line e.g.
|
38
|
-
## use just one style / syntax - why? why not?
|
39
|
-
## Fischhofgasse 12 ~ 1100 Wien or
|
40
|
-
## Fischhofgasse 12 // 1100 Wien or Fischhofgasse 12 /// 1100 Wien
|
41
|
-
## Fischhofgasse 12 ++ 1100 Wien or Fischhofgasse 12 +++ 1100 Wien
|
42
|
-
ADDR_MARKER_RE = %r{ (?: ^|[ ] ) # space or beginning of line
|
43
|
-
(?: ~ | /{2,} | \+{2,} )
|
44
|
-
(?: [ ]|$) # space or end of line
|
45
|
-
}x
|
46
|
-
|
47
|
-
|
48
|
-
def add_alt_names( rec, names ) ## helper for adding alternat names
|
49
|
-
|
50
|
-
## strip and squish (white)spaces
|
51
|
-
# e.g. New York FC (2011-) => New York FC (2011-)
|
52
|
-
names = names.map { |name| name.gsub( '$', '' ).strip
|
53
|
-
.gsub( /[ \t]+/, ' ' ) }
|
54
|
-
rec.alt_names += names
|
55
|
-
rec.add_variants( names ) # auto-add (possible) auto-generated variant names
|
56
|
-
|
57
|
-
## check for duplicates
|
58
|
-
if rec.duplicates?
|
59
|
-
duplicates = rec.duplicates
|
60
|
-
puts "*** !!! WARN !!! - #{duplicates.size} duplicate alt name mapping(s):"
|
61
|
-
pp duplicates
|
62
|
-
pp rec
|
63
|
-
##
|
64
|
-
## todo/fix: make it only an error with exit 1
|
65
|
-
## if (not normalized) names are the same (not unique/uniq)
|
66
|
-
## e.g. don't exit on A.F.C. == AFC etc.
|
67
|
-
## exit 1
|
68
|
-
end
|
69
|
-
end
|
70
|
-
|
71
|
-
|
72
|
-
def parse
|
73
|
-
recs = []
|
74
|
-
last_rec = nil
|
75
|
-
headings = [] ## headings stack
|
76
|
-
|
77
|
-
OutlineReader.parse( @txt ).each do |node|
|
78
|
-
if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
|
79
|
-
heading_level = node[0][1].to_i
|
80
|
-
heading = node[1]
|
81
|
-
|
82
|
-
puts "heading #{heading_level} >#{heading}<"
|
83
|
-
|
84
|
-
## 1) first pop headings if present
|
85
|
-
while headings.size+1 > heading_level
|
86
|
-
headings.pop
|
87
|
-
end
|
88
|
-
|
89
|
-
## 2) add missing (hierarchy) level if
|
90
|
-
while headings.size+1 < heading_level
|
91
|
-
## todo/fix: issue warning about "skipping" hierarchy level
|
92
|
-
puts "!!! warn [team reader] - skipping hierarchy level in headings "
|
93
|
-
headings.push( nil )
|
94
|
-
end
|
95
|
-
|
96
|
-
if heading =~ /^\?+$/ ## note: use ? or ?? or ?? to reset level to nil
|
97
|
-
## keep level empty
|
98
|
-
else
|
99
|
-
## note: if level is 1 assume country for now
|
100
|
-
if heading_level == 1
|
101
|
-
## assume country in heading; allow all "formats" supported by parse e.g.
|
102
|
-
## Österreich • Austria (at)
|
103
|
-
## Österreich • Austria
|
104
|
-
## Austria
|
105
|
-
## Deutschland (de) • Germany
|
106
|
-
country = catalog.countries.parse( heading )
|
107
|
-
## check country code - MUST exist for now!!!!
|
108
|
-
if country.nil?
|
109
|
-
puts "!!! error [club reader] - unknown country >#{heading}< - sorry - add country to config to fix"
|
110
|
-
exit 1
|
111
|
-
end
|
112
|
-
|
113
|
-
headings.push( country.key )
|
114
|
-
else
|
115
|
-
## quick hack:
|
116
|
-
## remove known fill/dummy words incl:
|
117
|
-
## Provincia San Juan => San Juan (see argentina, for example)
|
118
|
-
##
|
119
|
-
## use geo tree long term with alternative names - why? why not?
|
120
|
-
words = ['Provincia']
|
121
|
-
words.each { |word| heading = heading.gsub( word, '' ) }
|
122
|
-
heading = heading.strip
|
123
|
-
|
124
|
-
headings.push( heading )
|
125
|
-
end
|
126
|
-
|
127
|
-
## assert that hierarchy level is ok
|
128
|
-
if headings.size != heading_level
|
129
|
-
puts "!!! error - headings hierarchy/stack out of order - #{heading.size}<=>#{heading_level}"
|
130
|
-
exit 1
|
131
|
-
end
|
132
|
-
end
|
133
|
-
|
134
|
-
pp headings
|
135
|
-
|
136
|
-
elsif node[0] == :p ## paragraph with (text) lines
|
137
|
-
lines = node[1]
|
138
|
-
lines.each do |line|
|
139
|
-
if line.start_with?( '|' )
|
140
|
-
## assume continuation with line of alternative names
|
141
|
-
## note: skip leading pipe
|
142
|
-
values = line[1..-1].split( '|' ) # team names - allow/use pipe(|)
|
143
|
-
|
144
|
-
add_alt_names( last_rec, values ) ## note: use alt_names helper for (re)use
|
145
|
-
|
146
|
-
## check for b (child) team / club marker e.g.
|
147
|
-
## (ii) or ii) or ii.) or (ii.)
|
148
|
-
## (b) or b) or b.) or (b.)
|
149
|
-
## (2) or 2) or 2.) or (2.)
|
150
|
-
elsif line =~ B_TEAM_MARKER_RE
|
151
|
-
line = line.sub( B_TEAM_MARKER_RE, '' ).strip ## remove (leading) b team marker
|
152
|
-
|
153
|
-
## todo/fix: move into "regular" club branch - (re)use, that is, use the same code
|
154
|
-
# for both a and b team / club
|
155
|
-
rec = Club.new
|
156
|
-
value = line ## note: assume / allow just canonical name for now
|
157
|
-
## strip and squish (white)spaces
|
158
|
-
# e.g. New York FC (2011-) => New York FC (2011-)
|
159
|
-
value = value.gsub( '$', '' ).strip
|
160
|
-
.gsub( /[ \t]+/, ' ' )
|
161
|
-
|
162
|
-
rec.name = value # canoncial name (global unique "beautiful/long" name)
|
163
|
-
rec.add_variants( value ) # auto-add (possible) auto-generated variant names
|
164
|
-
|
165
|
-
### link a and b team / clubs
|
166
|
-
## assume last_rec is the a team
|
167
|
-
## todo/fix: check last_rec required NOT null
|
168
|
-
rec.a = last_rec
|
169
|
-
last_rec.b = rec
|
170
|
-
|
171
|
-
last_rec = rec
|
172
|
-
recs << rec
|
173
|
-
|
174
|
-
## check for address line e.g.
|
175
|
-
## use just one style / syntax - why? why not?
|
176
|
-
## Fischhofgasse 12 ~ 1100 Wien or
|
177
|
-
## Fischhofgasse 12 // 1100 Wien or Fischhofgasse 12 /// 1100 Wien
|
178
|
-
## Fischhofgasse 12 ++ 1100 Wien or Fischhofgasse 12 +++ 1100 Wien
|
179
|
-
elsif line =~ ADDR_MARKER_RE
|
180
|
-
# note skip for now!!!
|
181
|
-
# todo/fix: add support for address line!!!
|
182
|
-
puts " skipping address line for now >#{line}<"
|
183
|
-
else
|
184
|
-
values = line.split( ',' )
|
185
|
-
|
186
|
-
rec = Club.new
|
187
|
-
|
188
|
-
col = values.shift ## get first item
|
189
|
-
## note: allow optional alt names for convenience with required canoncial name
|
190
|
-
names = col.split( '|' ) # team names - allow/use pipe(|)
|
191
|
-
value = names[0] ## canonical name
|
192
|
-
alt_names = names[1..-1] ## optional (inline) alt names
|
193
|
-
|
194
|
-
## strip and squish (white)spaces
|
195
|
-
# e.g. New York FC (2011-) => New York FC (2011-)
|
196
|
-
value = value.gsub( '$', '' ).strip
|
197
|
-
.gsub( /[ \t]+/, ' ' )
|
198
|
-
rec.name = value # canoncial name (global unique "beautiful/long" name)
|
199
|
-
rec.add_variants( value ) # auto-add (possible) auto-generated variant names
|
200
|
-
|
201
|
-
## note: add optional (inline) alternate names if present
|
202
|
-
add_alt_names( rec, alt_names ) if alt_names.size > 0
|
203
|
-
|
204
|
-
## note:
|
205
|
-
## check/todo!!!!!!!!!!!!!!!!!-
|
206
|
-
## strip year if to present e.g. (2011-)
|
207
|
-
##
|
208
|
-
## do NOT strip for defunct / historic clubs e.g.
|
209
|
-
## (1899-1910)
|
210
|
-
## or (-1914) or (-2011) etc.
|
211
|
-
|
212
|
-
###
|
213
|
-
## todo: move year out of canonical team name - why? why not?
|
214
|
-
|
215
|
-
## check if canonical name include (2011-) or similar in name
|
216
|
-
## if yes, remove (2011-) and add to (alt) names
|
217
|
-
## e.g. New York FC (2011) => New York FC
|
218
|
-
if rec.name =~ /\(.+?\)/ ## note: use non-greedy (?) match
|
219
|
-
name = rec.name.gsub( /\(.+?\)/, '' ).strip
|
220
|
-
|
221
|
-
if rec.name =~ /\(([0-9]{4})-\)/ ## e.g. (2014-)
|
222
|
-
rec.year = $1.to_i
|
223
|
-
elsif rec.name =~ /\(-([0-9]{4})\)/ ## e.g. (-2014)
|
224
|
-
rec.year_end = $1.to_i
|
225
|
-
elsif rec.name =~ /\(([0-9]{4})-([0-9]{4})\)/ ## e.g. (2011-2014)
|
226
|
-
rec.year = $1.to_i
|
227
|
-
rec.year_end = $2.to_i
|
228
|
-
else
|
229
|
-
## todo/check: warn about unknown year format
|
230
|
-
end
|
231
|
-
end
|
232
|
-
|
233
|
-
## todo/check - check for unknown format values
|
234
|
-
## e.g. too many values, duplicate years, etc.
|
235
|
-
## check for overwritting, etc.
|
236
|
-
while values.size > 0
|
237
|
-
value = values.shift
|
238
|
-
## strip and squish (white)spaces
|
239
|
-
# e.g. León › Guanajuato => León › Guanajuato
|
240
|
-
value = value.strip.gsub( /[ \t]+/, ' ' )
|
241
|
-
if value =~/^\d{4}$/ # e.g 1904
|
242
|
-
## todo/check: issue warning if year is already set!!!!!!!
|
243
|
-
if rec.year
|
244
|
-
puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{value}:"
|
245
|
-
pp rec
|
246
|
-
exit 1
|
247
|
-
end
|
248
|
-
rec.year = value.to_i
|
249
|
-
elsif value.start_with?( '@' ) # e.g. @ Anfield
|
250
|
-
## cut-off leading @ and spaces
|
251
|
-
rec.ground = value[1..-1].strip
|
252
|
-
else
|
253
|
-
## assume city / geo tree
|
254
|
-
## split into geo tree
|
255
|
-
geos = split_geo( value )
|
256
|
-
city = geos[0]
|
257
|
-
## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
|
258
|
-
if city =~ /\((.+?)\)/ ## note: use non-greedy (?) match
|
259
|
-
rec.district = $1.strip
|
260
|
-
city = city.gsub( /\(.+?\)/, '' ).strip
|
261
|
-
end
|
262
|
-
rec.city = city
|
263
|
-
|
264
|
-
if geos.size > 1
|
265
|
-
## cut-off city and keep the rest (of geo tree)
|
266
|
-
rec.geos = geos[1..-1]
|
267
|
-
end
|
268
|
-
end
|
269
|
-
end ## while values
|
270
|
-
|
271
|
-
|
272
|
-
###############
|
273
|
-
## use headings text for geo tree
|
274
|
-
|
275
|
-
## 1) add country if present
|
276
|
-
if headings.size > 0 && headings[0]
|
277
|
-
country = catalog.countries.find( headings[0] )
|
278
|
-
rec.country = country
|
279
|
-
else
|
280
|
-
## make it an error - why? why not?
|
281
|
-
puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
|
282
|
-
exit 1
|
283
|
-
end
|
284
|
-
|
285
|
-
## 2) check geo tree with headings hierarchy
|
286
|
-
if headings.size > 1 && headings[1]
|
287
|
-
geos = split_geo( headings[1] )
|
288
|
-
if rec.geos
|
289
|
-
if rec.geos[0] != geos[0]
|
290
|
-
puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
|
291
|
-
exit 1
|
292
|
-
end
|
293
|
-
if rec.geos[1] && rec.geos[1] != geos[1] ## check optional 2nd level too
|
294
|
-
puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
|
295
|
-
exit 1
|
296
|
-
end
|
297
|
-
else
|
298
|
-
## add missing region (state/province) from headings hierarchy
|
299
|
-
rec.geos = geos
|
300
|
-
end
|
301
|
-
end
|
302
|
-
|
303
|
-
last_rec = rec
|
304
|
-
|
305
|
-
|
306
|
-
### todo/fix:
|
307
|
-
## auto-add alt name with dots stripped - why? why not?
|
308
|
-
## e.g. D.C. United => DC United
|
309
|
-
## e.g. Liverpool F.C. => Liverpool FC
|
310
|
-
## e.g. St. Albin => St Albin etc.
|
311
|
-
## e.g. 1. FC Köln => 1 FC Köln -- make special case for 1. - why? why not?
|
312
|
-
|
313
|
-
##
|
314
|
-
## todo/fix: unify mapping entries
|
315
|
-
## always lowercase !!!! (case insensitive)
|
316
|
-
## always strip (2011-) !!!
|
317
|
-
## always strip dots (e.g. St., F.C, etc.)
|
318
|
-
|
319
|
-
recs << rec
|
320
|
-
end
|
321
|
-
end # each line (in paragraph)
|
322
|
-
else
|
323
|
-
puts "** !!! ERROR !!! [club reader] - unknown line type:"
|
324
|
-
pp node
|
325
|
-
exit 1
|
326
|
-
end
|
327
|
-
end
|
328
|
-
|
329
|
-
recs
|
330
|
-
end # method read
|
331
|
-
|
332
|
-
#######################
|
333
|
-
### helpers
|
334
|
-
def split_geo( str )
|
335
|
-
## assume city / geo tree
|
336
|
-
## strip and squish (white)spaces
|
337
|
-
# e.g. León › Guanajuato => León › Guanajuato
|
338
|
-
str = str.strip.gsub( /[ \t]+/, ' ' )
|
339
|
-
|
340
|
-
## split into geo tree
|
341
|
-
geos = str.split( /[<>‹›]/ ) ## note: allow > < or › ‹
|
342
|
-
geos = geos.map { |geo| geo.strip } ## remove all whitespaces
|
343
|
-
geos
|
344
|
-
end
|
345
|
-
|
346
|
-
end # class ClubReader
|
347
|
-
|
348
|
-
|
349
|
-
end ## module Import
|
350
|
-
end ## module SportDb
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
module SportDb
|
5
|
+
module Import
|
6
|
+
|
7
|
+
|
8
|
+
class ClubReader
|
9
|
+
|
10
|
+
def catalog() Import.catalog; end
|
11
|
+
|
12
|
+
|
13
|
+
|
14
|
+
def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
|
15
|
+
txt = File.open( path, 'r:utf-8' ) { |f| f.read }
|
16
|
+
parse( txt )
|
17
|
+
end
|
18
|
+
|
19
|
+
def self.parse( txt )
|
20
|
+
new( txt ).parse
|
21
|
+
end
|
22
|
+
|
23
|
+
def initialize( txt )
|
24
|
+
@txt = txt
|
25
|
+
end
|
26
|
+
|
27
|
+
## pattern for b (child) team / club marker e.g.
|
28
|
+
## (ii) or ii) or ii.) or (ii.) or (II)
|
29
|
+
## (b) or b) or b.) or (b.) or (B)
|
30
|
+
## (2) or 2) or 2.) or (2.)
|
31
|
+
B_TEAM_MARKER_RE = %r{^ \(? # optional opening bracket
|
32
|
+
(?: ii|b|2 )
|
33
|
+
\.? # optional dot - keep and allow dot - why? why not?
|
34
|
+
\) # required closing bracket
|
35
|
+
}xi ## note: add case-insenstive (e.g. II/ii or B/b)
|
36
|
+
|
37
|
+
## pattern for checking for address line e.g.
|
38
|
+
## use just one style / syntax - why? why not?
|
39
|
+
## Fischhofgasse 12 ~ 1100 Wien or
|
40
|
+
## Fischhofgasse 12 // 1100 Wien or Fischhofgasse 12 /// 1100 Wien
|
41
|
+
## Fischhofgasse 12 ++ 1100 Wien or Fischhofgasse 12 +++ 1100 Wien
|
42
|
+
ADDR_MARKER_RE = %r{ (?: ^|[ ] ) # space or beginning of line
|
43
|
+
(?: ~ | /{2,} | \+{2,} )
|
44
|
+
(?: [ ]|$) # space or end of line
|
45
|
+
}x
|
46
|
+
|
47
|
+
|
48
|
+
def add_alt_names( rec, names ) ## helper for adding alternat names
|
49
|
+
|
50
|
+
## strip and squish (white)spaces
|
51
|
+
# e.g. New York FC (2011-) => New York FC (2011-)
|
52
|
+
names = names.map { |name| name.gsub( '$', '' ).strip
|
53
|
+
.gsub( /[ \t]+/, ' ' ) }
|
54
|
+
rec.alt_names += names
|
55
|
+
rec.add_variants( names ) # auto-add (possible) auto-generated variant names
|
56
|
+
|
57
|
+
## check for duplicates
|
58
|
+
if rec.duplicates?
|
59
|
+
duplicates = rec.duplicates
|
60
|
+
puts "*** !!! WARN !!! - #{duplicates.size} duplicate alt name mapping(s):"
|
61
|
+
pp duplicates
|
62
|
+
pp rec
|
63
|
+
##
|
64
|
+
## todo/fix: make it only an error with exit 1
|
65
|
+
## if (not normalized) names are the same (not unique/uniq)
|
66
|
+
## e.g. don't exit on A.F.C. == AFC etc.
|
67
|
+
## exit 1
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
|
72
|
+
def parse
|
73
|
+
recs = []
|
74
|
+
last_rec = nil
|
75
|
+
headings = [] ## headings stack
|
76
|
+
|
77
|
+
OutlineReader.parse( @txt ).each do |node|
|
78
|
+
if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
|
79
|
+
heading_level = node[0][1].to_i
|
80
|
+
heading = node[1]
|
81
|
+
|
82
|
+
puts "heading #{heading_level} >#{heading}<"
|
83
|
+
|
84
|
+
## 1) first pop headings if present
|
85
|
+
while headings.size+1 > heading_level
|
86
|
+
headings.pop
|
87
|
+
end
|
88
|
+
|
89
|
+
## 2) add missing (hierarchy) level if
|
90
|
+
while headings.size+1 < heading_level
|
91
|
+
## todo/fix: issue warning about "skipping" hierarchy level
|
92
|
+
puts "!!! warn [team reader] - skipping hierarchy level in headings "
|
93
|
+
headings.push( nil )
|
94
|
+
end
|
95
|
+
|
96
|
+
if heading =~ /^\?+$/ ## note: use ? or ?? or ?? to reset level to nil
|
97
|
+
## keep level empty
|
98
|
+
else
|
99
|
+
## note: if level is 1 assume country for now
|
100
|
+
if heading_level == 1
|
101
|
+
## assume country in heading; allow all "formats" supported by parse e.g.
|
102
|
+
## Österreich • Austria (at)
|
103
|
+
## Österreich • Austria
|
104
|
+
## Austria
|
105
|
+
## Deutschland (de) • Germany
|
106
|
+
country = catalog.countries.parse( heading )
|
107
|
+
## check country code - MUST exist for now!!!!
|
108
|
+
if country.nil?
|
109
|
+
puts "!!! error [club reader] - unknown country >#{heading}< - sorry - add country to config to fix"
|
110
|
+
exit 1
|
111
|
+
end
|
112
|
+
|
113
|
+
headings.push( country.key )
|
114
|
+
else
|
115
|
+
## quick hack:
|
116
|
+
## remove known fill/dummy words incl:
|
117
|
+
## Provincia San Juan => San Juan (see argentina, for example)
|
118
|
+
##
|
119
|
+
## use geo tree long term with alternative names - why? why not?
|
120
|
+
words = ['Provincia']
|
121
|
+
words.each { |word| heading = heading.gsub( word, '' ) }
|
122
|
+
heading = heading.strip
|
123
|
+
|
124
|
+
headings.push( heading )
|
125
|
+
end
|
126
|
+
|
127
|
+
## assert that hierarchy level is ok
|
128
|
+
if headings.size != heading_level
|
129
|
+
puts "!!! error - headings hierarchy/stack out of order - #{heading.size}<=>#{heading_level}"
|
130
|
+
exit 1
|
131
|
+
end
|
132
|
+
end
|
133
|
+
|
134
|
+
pp headings
|
135
|
+
|
136
|
+
elsif node[0] == :p ## paragraph with (text) lines
|
137
|
+
lines = node[1]
|
138
|
+
lines.each do |line|
|
139
|
+
if line.start_with?( '|' )
|
140
|
+
## assume continuation with line of alternative names
|
141
|
+
## note: skip leading pipe
|
142
|
+
values = line[1..-1].split( '|' ) # team names - allow/use pipe(|)
|
143
|
+
|
144
|
+
add_alt_names( last_rec, values ) ## note: use alt_names helper for (re)use
|
145
|
+
|
146
|
+
## check for b (child) team / club marker e.g.
|
147
|
+
## (ii) or ii) or ii.) or (ii.)
|
148
|
+
## (b) or b) or b.) or (b.)
|
149
|
+
## (2) or 2) or 2.) or (2.)
|
150
|
+
elsif line =~ B_TEAM_MARKER_RE
|
151
|
+
line = line.sub( B_TEAM_MARKER_RE, '' ).strip ## remove (leading) b team marker
|
152
|
+
|
153
|
+
## todo/fix: move into "regular" club branch - (re)use, that is, use the same code
|
154
|
+
# for both a and b team / club
|
155
|
+
rec = Club.new
|
156
|
+
value = line ## note: assume / allow just canonical name for now
|
157
|
+
## strip and squish (white)spaces
|
158
|
+
# e.g. New York FC (2011-) => New York FC (2011-)
|
159
|
+
value = value.gsub( '$', '' ).strip
|
160
|
+
.gsub( /[ \t]+/, ' ' )
|
161
|
+
|
162
|
+
rec.name = value # canoncial name (global unique "beautiful/long" name)
|
163
|
+
rec.add_variants( value ) # auto-add (possible) auto-generated variant names
|
164
|
+
|
165
|
+
### link a and b team / clubs
|
166
|
+
## assume last_rec is the a team
|
167
|
+
## todo/fix: check last_rec required NOT null
|
168
|
+
rec.a = last_rec
|
169
|
+
last_rec.b = rec
|
170
|
+
|
171
|
+
last_rec = rec
|
172
|
+
recs << rec
|
173
|
+
|
174
|
+
## check for address line e.g.
|
175
|
+
## use just one style / syntax - why? why not?
|
176
|
+
## Fischhofgasse 12 ~ 1100 Wien or
|
177
|
+
## Fischhofgasse 12 // 1100 Wien or Fischhofgasse 12 /// 1100 Wien
|
178
|
+
## Fischhofgasse 12 ++ 1100 Wien or Fischhofgasse 12 +++ 1100 Wien
|
179
|
+
elsif line =~ ADDR_MARKER_RE
|
180
|
+
# note skip for now!!!
|
181
|
+
# todo/fix: add support for address line!!!
|
182
|
+
puts " skipping address line for now >#{line}<"
|
183
|
+
else
|
184
|
+
values = line.split( ',' )
|
185
|
+
|
186
|
+
rec = Club.new
|
187
|
+
|
188
|
+
col = values.shift ## get first item
|
189
|
+
## note: allow optional alt names for convenience with required canoncial name
|
190
|
+
names = col.split( '|' ) # team names - allow/use pipe(|)
|
191
|
+
value = names[0] ## canonical name
|
192
|
+
alt_names = names[1..-1] ## optional (inline) alt names
|
193
|
+
|
194
|
+
## strip and squish (white)spaces
|
195
|
+
# e.g. New York FC (2011-) => New York FC (2011-)
|
196
|
+
value = value.gsub( '$', '' ).strip
|
197
|
+
.gsub( /[ \t]+/, ' ' )
|
198
|
+
rec.name = value # canoncial name (global unique "beautiful/long" name)
|
199
|
+
rec.add_variants( value ) # auto-add (possible) auto-generated variant names
|
200
|
+
|
201
|
+
## note: add optional (inline) alternate names if present
|
202
|
+
add_alt_names( rec, alt_names ) if alt_names.size > 0
|
203
|
+
|
204
|
+
## note:
|
205
|
+
## check/todo!!!!!!!!!!!!!!!!!-
|
206
|
+
## strip year if to present e.g. (2011-)
|
207
|
+
##
|
208
|
+
## do NOT strip for defunct / historic clubs e.g.
|
209
|
+
## (1899-1910)
|
210
|
+
## or (-1914) or (-2011) etc.
|
211
|
+
|
212
|
+
###
|
213
|
+
## todo: move year out of canonical team name - why? why not?
|
214
|
+
|
215
|
+
## check if canonical name include (2011-) or similar in name
|
216
|
+
## if yes, remove (2011-) and add to (alt) names
|
217
|
+
## e.g. New York FC (2011) => New York FC
|
218
|
+
if rec.name =~ /\(.+?\)/ ## note: use non-greedy (?) match
|
219
|
+
name = rec.name.gsub( /\(.+?\)/, '' ).strip
|
220
|
+
|
221
|
+
if rec.name =~ /\(([0-9]{4})-\)/ ## e.g. (2014-)
|
222
|
+
rec.year = $1.to_i
|
223
|
+
elsif rec.name =~ /\(-([0-9]{4})\)/ ## e.g. (-2014)
|
224
|
+
rec.year_end = $1.to_i
|
225
|
+
elsif rec.name =~ /\(([0-9]{4})-([0-9]{4})\)/ ## e.g. (2011-2014)
|
226
|
+
rec.year = $1.to_i
|
227
|
+
rec.year_end = $2.to_i
|
228
|
+
else
|
229
|
+
## todo/check: warn about unknown year format
|
230
|
+
end
|
231
|
+
end
|
232
|
+
|
233
|
+
## todo/check - check for unknown format values
|
234
|
+
## e.g. too many values, duplicate years, etc.
|
235
|
+
## check for overwriting, etc.
|
236
|
+
while values.size > 0
|
237
|
+
value = values.shift
|
238
|
+
## strip and squish (white)spaces
|
239
|
+
# e.g. León › Guanajuato => León › Guanajuato
|
240
|
+
value = value.strip.gsub( /[ \t]+/, ' ' )
|
241
|
+
if value =~/^\d{4}$/ # e.g 1904
|
242
|
+
## todo/check: issue warning if year is already set!!!!!!!
|
243
|
+
if rec.year
|
244
|
+
puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{value}:"
|
245
|
+
pp rec
|
246
|
+
exit 1
|
247
|
+
end
|
248
|
+
rec.year = value.to_i
|
249
|
+
elsif value.start_with?( '@' ) # e.g. @ Anfield
|
250
|
+
## cut-off leading @ and spaces
|
251
|
+
rec.ground = value[1..-1].strip
|
252
|
+
else
|
253
|
+
## assume city / geo tree
|
254
|
+
## split into geo tree
|
255
|
+
geos = split_geo( value )
|
256
|
+
city = geos[0]
|
257
|
+
## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
|
258
|
+
if city =~ /\((.+?)\)/ ## note: use non-greedy (?) match
|
259
|
+
rec.district = $1.strip
|
260
|
+
city = city.gsub( /\(.+?\)/, '' ).strip
|
261
|
+
end
|
262
|
+
rec.city = city
|
263
|
+
|
264
|
+
if geos.size > 1
|
265
|
+
## cut-off city and keep the rest (of geo tree)
|
266
|
+
rec.geos = geos[1..-1]
|
267
|
+
end
|
268
|
+
end
|
269
|
+
end ## while values
|
270
|
+
|
271
|
+
|
272
|
+
###############
|
273
|
+
## use headings text for geo tree
|
274
|
+
|
275
|
+
## 1) add country if present
|
276
|
+
if headings.size > 0 && headings[0]
|
277
|
+
country = catalog.countries.find( headings[0] )
|
278
|
+
rec.country = country
|
279
|
+
else
|
280
|
+
## make it an error - why? why not?
|
281
|
+
puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
|
282
|
+
exit 1
|
283
|
+
end
|
284
|
+
|
285
|
+
## 2) check geo tree with headings hierarchy
|
286
|
+
if headings.size > 1 && headings[1]
|
287
|
+
geos = split_geo( headings[1] )
|
288
|
+
if rec.geos
|
289
|
+
if rec.geos[0] != geos[0]
|
290
|
+
puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
|
291
|
+
exit 1
|
292
|
+
end
|
293
|
+
if rec.geos[1] && rec.geos[1] != geos[1] ## check optional 2nd level too
|
294
|
+
puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
|
295
|
+
exit 1
|
296
|
+
end
|
297
|
+
else
|
298
|
+
## add missing region (state/province) from headings hierarchy
|
299
|
+
rec.geos = geos
|
300
|
+
end
|
301
|
+
end
|
302
|
+
|
303
|
+
last_rec = rec
|
304
|
+
|
305
|
+
|
306
|
+
### todo/fix:
|
307
|
+
## auto-add alt name with dots stripped - why? why not?
|
308
|
+
## e.g. D.C. United => DC United
|
309
|
+
## e.g. Liverpool F.C. => Liverpool FC
|
310
|
+
## e.g. St. Albin => St Albin etc.
|
311
|
+
## e.g. 1. FC Köln => 1 FC Köln -- make special case for 1. - why? why not?
|
312
|
+
|
313
|
+
##
|
314
|
+
## todo/fix: unify mapping entries
|
315
|
+
## always lowercase !!!! (case insensitive)
|
316
|
+
## always strip (2011-) !!!
|
317
|
+
## always strip dots (e.g. St., F.C, etc.)
|
318
|
+
|
319
|
+
recs << rec
|
320
|
+
end
|
321
|
+
end # each line (in paragraph)
|
322
|
+
else
|
323
|
+
puts "** !!! ERROR !!! [club reader] - unknown line type:"
|
324
|
+
pp node
|
325
|
+
exit 1
|
326
|
+
end
|
327
|
+
end
|
328
|
+
|
329
|
+
recs
|
330
|
+
end # method read
|
331
|
+
|
332
|
+
#######################
|
333
|
+
### helpers
|
334
|
+
def split_geo( str )
  ## Break a city / geo tree string into its levels.
  ##   e.g. "León › Guanajuato"  =>  ["León", "Guanajuato"]
  ##
  ## Steps:
  ##   1) trim both ends and collapse every run of spaces/tabs into one space
  ##   2) cut on any of the separators  >  <  ›  ‹
  ##   3) trim leading/trailing whitespace of every level
  ##
  ## Returns an array of strings (first entry is the city).
  str.strip
     .gsub( /[ \t]+/, ' ' )
     .split( /[<>‹›]/ )
     .map( &:strip )
end
|
345
|
+
|
346
|
+
end # class ClubReader
|
347
|
+
|
348
|
+
|
349
|
+
end ## module Import
|
350
|
+
end ## module SportDb
|