sportdb-config 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/{HISTORY.md → CHANGELOG.md} +0 -0
- data/Manifest.txt +1 -11
- data/Rakefile +8 -4
- data/lib/sportdb/config.rb +9 -33
- data/lib/sportdb/config/config.rb +36 -4
- data/lib/sportdb/config/version.rb +1 -1
- data/test/test_clubs.rb +76 -20
- metadata +48 -16
- data/lib/sportdb/config/club.rb +0 -147
- data/lib/sportdb/config/club_index.rb +0 -212
- data/lib/sportdb/config/club_reader.rb +0 -278
- data/lib/sportdb/config/countries.rb +0 -58
- data/lib/sportdb/config/variants.rb +0 -185
- data/lib/sportdb/config/wiki_reader.rb +0 -104
- data/test/test_club_index.rb +0 -100
- data/test/test_club_reader.rb +0 -150
- data/test/test_variants.rb +0 -46
- data/test/test_wiki_reader.rb +0 -77
data/lib/sportdb/config/club.rb
DELETED
@@ -1,147 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module SportDb
|
4
|
-
module Import
|
5
|
-
|
6
|
-
##
|
7
|
-
# note: use our own (internal) club struct for now - why? why not?
|
8
|
-
# - check that shape/structure/fields/attributes match
|
9
|
-
# the Team struct in sportdb-text (in SportDb::Struct::Team) !!!!
|
10
|
-
# Club record used by the importer.
#
# Holds the canonical name, hand-typed alternative names, auto-generated
# name variants and a few descriptive attributes. The class-level helpers
# (strip_year, strip_lang, sanitize, normalize, strip_wiki) clean up names
# and are reusable without a Club instance.
class Club
  ## todo: use just names for alt_names - why? why not?
  attr_accessor :name, :alt_names,
                :year, :ground, :city

  ## more attribs - todo/fix - also add "upstream" to struct & model!!!!!
  attr_accessor :district, :geos, :year_end, :country

  ## special import only attribs
  attr_accessor :alt_names_auto   ## auto-generated alt names
  attr_accessor :wikipedia        # wikipedia page name (for english (en))

  # Starts with empty lists for both kinds of alternative names;
  # all other attributes stay nil until assigned.
  def initialize
    @alt_names      = []
    @alt_names_auto = []
  end

  # True if an end year is set (that is, the club no longer exists).
  def historic?() @year_end ? true : false; end
  alias_method :past?, :historic?

  # True if a wikipedia page name is set.
  def wikipedia?() @wikipedia ? true : false; end

  # Returns the english wikipedia url for the page name or nil if no
  # page name is set.
  def wikipedia_url
    return nil unless @wikipedia

    ## note: replace spaces with underscore (_)
    ##   e.g. Club Brugge KV => Club_Brugge_KV
    ##  todo/check/fix:
    ##    check if "plain" dash (-) needs to get replaced with typographic dash??
    "https://en.wikipedia.org/wiki/#{@wikipedia.gsub( ' ', '_' )}"
  end


  ## helper methods for import only

  # True if two or more names (canonical, alt or auto-generated alt) are
  # the same after sanitize + normalize.
  def duplicates?
    normalized = all_names.map { |a_name| normalize( sanitize( a_name ) ) }
    normalized.size != normalized.uniq.size
  end

  # Returns a hash mapping each normalized name that occurs more than once
  # to the list of original names that produced it.
  def duplicates
    all_names
      .group_by { |a_name| normalize( sanitize( a_name ) ) }
      .select   { |_norm, originals| originals.size > 1 }
  end

  # Adds the auto-generated variants for one name or an array of names
  # to alt_names_auto. Every name gets sanitized before the lookup.
  def add_variants( name_or_names )
    names = name_or_names.is_a?( Array ) ? name_or_names : [name_or_names]
    names.each do |a_name|
      self.alt_names_auto += variants( sanitize( a_name ) )
    end
  end

  ###################################
  # "global" helper - move to ___ ? why? why not?

  ## matches year(s) in parentheses e.g. (1887-1911), (-2013),
  ##                                     (1946-2001, 2013-) etc.
  YEAR_REGEX = /\([0-9,\- ]+?\)/

  # Returns name with all year markers removed (and outer whitespace stripped).
  def self.strip_year( name )
    name.gsub( YEAR_REGEX, '' ).strip
  end

  # True if name includes a year marker.
  def self.has_year?( name ) (name =~ YEAR_REGEX) ? true : false; end

  ## matches lang codes in brackets e.g. [en], [fr], etc.
  LANG_REGEX = /\[[a-z]{1,2}\]/   ## note also allow [a] or [d] or [e] - why? why not?

  # Returns name with all lang code markers removed (and outer whitespace stripped).
  def self.strip_lang( name )
    name.gsub( LANG_REGEX, '' ).strip
  end

  # True if name includes a lang code marker.
  def self.has_lang?( name ) (name =~ LANG_REGEX) ? true : false; end

  # Removes year markers first and lang code markers second.
  def self.sanitize( name )
    strip_lang( strip_year( name ) )
  end


  ## note: remove all dots (.), dash (-), ', º, /, etc.
  ##         for norm(alizing) names
  NORM_REGEX = /[.'º\-\/]/

  # Returns name with all characters matched by NORM_REGEX removed.
  def self.strip_norm( name )
    name.gsub( NORM_REGEX, '' )
  end

  # Returns the lookup key for a name: punctuation and all spaces removed,
  # then passed through downcase_i18n (a helper defined outside this class).
  #
  # note: does NOT call sanitize (keep normalize "atomic" for reuse)
  def self.normalize( name )
    compact = strip_norm( name ).gsub( ' ', '' )   # note: also remove all spaces!!!

    ## todo/fix: use our own downcase - why? why not?
    downcase_i18n( compact )    ## do NOT care about upper and lowercase for now
  end


  # Strips the disambiguation qualifier from a wikipedia page name.
  #
  # note: only remove year and foot... for now
  #   e.g. FC Wacker Innsbruck (2002)  => FC Wacker Innsbruck
  #        Willem II (football club)   => Willem II
  #
  #   do NOT strip others !! e.g.
  #        América Futebol Clube (MG)
  #   only add more "special" cases on demand (that is, if we find more)
  def self.strip_wiki( name )     # todo/check: rename to strip_wikipedia_en - why? why not?
    without_year = name.gsub( /\([12][^\)]+?\)/, '' ).strip   ## starting with a digit 1 or 2 (assuming year)
    without_year.gsub( /\(foot[^\)]+?\)/, '' ).strip          ## starting with foot (assuming football ...)
  end


  private

  ## canonical name + alt names + auto-generated alt names (in that order)
  def all_names() [name] + alt_names + alt_names_auto; end

  ## private "shortcut" convenience helpers
  def sanitize( name )  self.class.sanitize( name ); end
  def normalize( name ) self.class.normalize( name ); end

  ## note: Variant is defined outside this class
  def variants( name )  Variant.find( name ); end
end # class Club
|
145
|
-
|
146
|
-
end # module Import
|
147
|
-
end # module SportDb
|
@@ -1,212 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module SportDb
|
4
|
-
module Import
|
5
|
-
|
6
|
-
|
7
|
-
# In-memory index of club records.
#
# Keeps two lookups: clubs by canonical name (exact match) and clubs by
# normalized name (canonical + alternative + auto-generated names).
# Several clubs may share a normalized name; use match_by to narrow the
# matches down by country.
#
# note: fatal data errors get printed and end the process with exit 1.
class ClubIndex

  # Builds an index from all club datafiles and wiki(pedia) datafiles
  # found in path (via Configuration.find_datafiles_clubs and
  # Configuration.find_datafiles_clubs_wiki, defined outside this class).
  def self.build( path )
    recs = []
    Configuration.find_datafiles_clubs( path ).each do |datafile|
      recs += ClubReader.read( datafile )
    end

    clubs = new
    clubs.add( recs )

    ## add wiki(pedia) anchored links
    recs = []
    Configuration.find_datafiles_clubs_wiki( path ).each do |datafile|
      recs += WikiReader.read( datafile )
    end

    pp recs
    clubs.add_wiki( recs )
    clubs
  end


  def initialize
    @clubs         = {}   ## clubs (indexed) by canonical name
    @clubs_by_name = {}   ## lists of clubs (indexed) by normalized name
    @errors        = []   ## collected name conflict warnings
  end

  attr_reader :errors
  def errors?() !@errors.empty?; end

  def mappings() @clubs_by_name; end   ## todo/check: rename to index or something - why? why not?
  def clubs()    @clubs.values;  end


  ## helpers from club - use a helper module for includes - why? why not?
  def strip_year( name ) Club.strip_year( name ); end
  def has_year?( name )  Club.has_year?( name );  end
  def strip_lang( name ) Club.strip_lang( name ); end
  def strip_wiki( name ) Club.strip_wiki( name ); end
  def normalize( name )  Club.normalize( name );  end


  # Adds one wiki(pedia) club record or an array of records: finds the one
  # matching club (by name and country) and sets its wikipedia page name.
  # Prints an error and exits (1) if no club or more than one club matches.
  def add_wiki( rec_or_recs )
    recs = rec_or_recs.is_a?( Array ) ? rec_or_recs : [rec_or_recs]    ## wrap (single) rec in array

    recs.each do |rec|
      ## note: strip qualifier () from wikipedia page name if present
      ##   e.g. FC Wacker Innsbruck (2002) => FC Wacker Innsbruck
      ##        Willem II (football club)  => Willem II
      ##
      ##   do NOT strip others !! e.g.
      ##        América Futebol Clube (MG)
      ##   only add more "special" cases on demand (that is, if we find more)
      name = strip_wiki( rec.name )

      m = match_by( name: name, country: rec.country )
      if m.nil?
        puts "** !!! ERROR !!! - no matching club found for wiki(pedia) name >#{name}, #{rec.country.name} (#{rec.country.key})<; sorry - to fix add name to clubs"
        exit 1
      end
      if m.size > 1
        puts "** !!! ERROR !!! - too many (greater than one) matching clubs found for wiki(pedia) name >#{name}, #{rec.country.name} (#{rec.country.key})<"
        pp m
        exit 1
      end

      m[0].wikipedia = rec.name
    end
  end # method add_wiki


  # Adds one club record or an array of records.
  #
  # Exits (1) on a duplicate canonical name or on duplicates in the
  # hand-typed names of a record; duplicates that only show up with the
  # auto-generated names or after normalizing get reported as warnings
  # (the normalized name conflicts get collected in errors too).
  def add( rec_or_recs )
    recs = rec_or_recs.is_a?( Array ) ? rec_or_recs : [rec_or_recs]    ## wrap (single) rec in array

    recs.each do |rec|
      ## puts "adding:"
      ## pp rec

      ### step 1) add canonical name
      old_rec = @clubs[ rec.name ]
      if old_rec
        puts "** !!! ERROR !!! - (canonical) name conflict - duplicate - >#{rec.name}< will overwrite >#{old_rec.name}<:"
        pp old_rec
        pp rec
        exit 1
      end
      @clubs[ rec.name ] = rec

      ## step 2) add all names (canonical name + alt names + alt names (auto))
      names = [rec.name] + rec.alt_names

      ## check "hand-typed" names for year (auto-add)
      ##   check for year(s) e.g. (1887-1911), (-2013),
      ##                          (1946-2001,2013-) etc.
      names += names.select { |name| has_year?( name ) }
                    .map    { |name| strip_year( name ) }

      ## check for duplicates - simple check for now - fix/improve
      ## todo/fix: (auto)remove duplicates - why? why not?
      count      = names.size
      count_uniq = names.uniq.size
      if count != count_uniq
        puts "** !!! ERROR !!! - #{count-count_uniq} duplicate name(s):"
        pp names
        pp rec
        exit 1
      end

      ## check with auto-names just warn for now and do not exit
      names += rec.alt_names_auto
      count      = names.size
      count_uniq = names.uniq.size
      if count != count_uniq
        puts "** !!! WARN !!! - #{count-count_uniq} duplicate name(s):"
        pp names
        pp rec
      end

      names.each do |name|
        ## check lang codes e.g. [en], [fr], etc.
        ##  todo/check/fix: move strip_lang up in the chain - check for duplicates (e.g. only lang code marker different etc.) - why? why not?
        name = strip_lang( name )
        norm = normalize( name )
        alt_recs = @clubs_by_name[ norm ]

        if alt_recs.nil?
          @clubs_by_name[ norm ] = [rec]
        elsif alt_recs.include?( rec )
          ## note: do NOT include duplicate club record
          msg = "** !!! WARN !!! - (norm) name conflict/duplicate for club - >#{name}< normalized to >#{norm}< already included >#{rec.name}, #{rec.country.name}<"
          puts msg
          @errors << msg
        else
          ## note: another club uses the same (normalized) name - keep both
          msg = "** !!! WARN !!! - name conflict/duplicate - >#{name}< will overwrite >#{alt_recs[0].name}, #{alt_recs[0].country.name}< with >#{rec.name}, #{rec.country.name}<"
          puts msg
          @errors << msg
          alt_recs << rec
        end
      end
    end
  end # method add


  # Returns the club for the (exact) canonical name or nil.
  def []( name )    ## lookup by canonical name only
    @clubs[ name ]
  end

  # Returns the array of clubs registered for the normalized name
  # or nil if there is no match.
  def match( name )
    ## todo/check: return empty array if no match!!! and NOT nil (add || []) - why? why not?
    @clubs_by_name[ normalize( name ) ]
  end


  # Like match but keeps only clubs of the country passed in.
  # country is a country struct or a code that gets looked up in
  # SportDb::Import.config.countries; an unknown code prints an error and
  # exits (1). Returns nil if no clubs are left.
  def match_by( name:, country: )
    ## note: match must for now always include name
    m = match( name )
    return m unless m

    ## note: allow passing in of country struct too
    country_rec = if country.is_a?( SportDb::Import::Country )
                    country   ## (re)use country struct - no need to run lookup again
                  else
                    ## note: country assumes / allows the country key or fifa code for now
                    rec = SportDb::Import.config.countries[ country ]
                    if rec.nil?
                      puts "** !!! ERROR !!! - unknown country >#{country}< - no match found, sorry - add to world/countries.txt in config"
                      exit 1
                    end
                    rec
                  end

    m = m.select { |club| club.country.key == country_rec.key }
    m.empty? ? nil : m     ## note: reset to nil if no more matches
  end


  # debug helper - report duplicate club name records
  def dump_duplicates
    @clubs_by_name.each do |name, clubs|
      next unless clubs.size > 1

      puts "#{clubs.size} matching club duplicates for >#{name}<:"
      pp clubs
    end
  end
end # class ClubIndex
|
209
|
-
|
210
|
-
|
211
|
-
end # module Import
|
212
|
-
end # module SportDb
|
@@ -1,278 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
|
4
|
-
module SportDb
|
5
|
-
module Import
|
6
|
-
|
7
|
-
|
8
|
-
# Reads club records from the plain text club datafile format:
#
#   = Austria (at) =                  level 1 heading - country (with code)
#   == Wien ==                        level 2+ headings - region / geo tree
#   FK Austria Wien, 1911, @ Generali Arena, Wien (Favoriten)
#     | Austria Wien | FAK            continuation line with alternative names
#
# note: format errors get printed and end the process with exit 1.
class ClubReader

  # Reads the file at path (as utf-8) and returns the parsed club records.
  def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
    ## note: use the block form to make sure the file gets closed
    txt = File.open( path, 'r:utf-8' ) { |f| f.read }
    parse( txt )
  end


  # Parses the text line by line and returns an array of Club records.
  # Country codes in level 1 headings get looked up in
  # SportDb::Import.config.countries.
  def self.parse( txt )
    recs     = []
    last_rec = nil
    headings = []   ## headings stack

    txt.each_line do |line|
      line = line.strip

      next if line.empty?
      next if line.start_with?( '#' )   ## skip comments too

      ## strip inline (until end-of-line) comments too
      ##  e.g Eupen => KAS Eupen,    ## [de]
      ##   => Eupen => KAS Eupen,
      line = line.sub( /#.*/, '' ).strip
      pp line

      next if line =~ /^={1,}$/    ## skip "decorative" only heading e.g. ========

      ## note: like in wikimedia markup (and markdown) all optional trailing ==== too
      ##  todo/check:  allow ===  Text  =-=-=-=-=-=   too - why? why not?
      if line =~ /^(={1,})       ## leading ======
                   ([^=]+?)      ##  text   (note: for now no "inline" = allowed)
                   =*            ## (optional) trailing ====
                   $/x
        heading_level = $1.length   ## count number of = for heading level
        heading       = $2.strip

        puts "heading #{heading_level} >#{heading}<"

        ## 1) first pop headings if present
        headings.pop  while headings.size+1 > heading_level

        ## 2) add missing (hierarchy) level if
        while headings.size+1 < heading_level
          ## todo/fix: issue warning about "skipping" hierarchy level
          puts "!!! warn [team reader] - skipping hierarchy level in headings "
          headings.push( nil )
        end

        ## note: use ? or ?? or ??? to reset level to nil (that is, keep level empty)
        unless heading =~ /^\?+$/
          ## quick hack: if level is 1 assume country for now
          ##                 and extract country code e.g.
          ##                    Austria (at)  => at
          ## todo/fix: allow code only e.g. at or aut without enclosing () too - why? why not?
          if heading_level == 1
            if heading =~ /\(([a-z]{2,3})\)/i   ## note allow (at) or (AUT) too
              country_code = $1

              ## check country code - MUST exist for now!!!!
              country = SportDb::Import.config.countries[ country_code ]
              if country.nil?
                puts "!!! error [team reader] - unknown country with code >#{country_code}< - sorry - add country to config to fix"
                exit 1
              end

              headings.push( country.key )
            else
              puts "!!! error - heading level 1 - missing country code - >#{heading}<"
              exit 1
            end
          else
            ## quick hack:
            ##   remove known fill/dummy words incl:
            ##     Provincia San Juan  =>  San Juan   (see argentina, for example)
            ##
            ##   use geo tree long term with alternative names - why? why not?
            words = ['Provincia']
            words.each { |word| heading = heading.gsub( word, '' ) }
            heading = heading.strip

            headings.push( heading )
          end

          ## assert that hierarchy level is ok
          ##   note: report the size of the headings stack (not the heading text)
          if headings.size != heading_level
            puts "!!! error - headings hierarchy/stack out of order - #{headings.size}<=>#{heading_level}"
            exit 1
          end
        end

        pp headings

      elsif line.start_with?( '|' )
        ## assume continuation with line of alternative names
        if last_rec.nil?
          ## note: nothing to continue - alt names MUST follow a club line
          puts "!!! error [team reader] - alternative names line without preceding club - >#{line}<"
          exit 1
        end

        ## note: skip leading pipe
        ##   team names - allow/use pipe(|)
        ##   e.g. New York FC      (2011-)  => New York FC (2011-)
        values = line[1..-1].split( '|' ).map { |value| squish( value ) }
        last_rec.alt_names += values
        last_rec.add_variants( values )   # auto-add (possible) auto-generated variant names

        ## check for duplicates
        if last_rec.duplicates?
          duplicates = last_rec.duplicates
          puts "*** !!! WARN !!! - #{duplicates.size} duplicate alt name mapping(s):"
          pp duplicates
          pp last_rec
          ##
          ##  todo/fix:  make it only an error with exit 1
          ##               if (not normalized) names are the same (not unique/uniq)
          ##                 e.g. don't exit on  A.F.C. == AFC etc.
          ## exit 1
        end
      else
        values = line.split( ',' )

        rec   = Club.new
        value = squish( values.shift )   ## get first item
        rec.name = value                 # canonical name (global unique "beautiful/long" name)
        rec.add_variants( value )        # auto-add (possible) auto-generated variant names

        ## note:
        ##   check/todo!!!!!!!!!!!!!!!!!-
        ##   strip year if to present e.g. (2011-)
        ##
        ##   do NOT strip for defunct / historic clubs e.g.
        ##              (1899-1910)
        ##   or         (-1914) or (-2011) etc.

        ###
        ## todo: move year out of canonical team name - why? why not?
        ##   for now the year marker stays in the canonical name
        ##    and only gets used to set year / year_end
        case rec.name
        when /\(([0-9]{4})-\)/               ## e.g. (2014-)
          rec.year = $1.to_i
        when /\(-([0-9]{4})\)/               ## e.g. (-2014)
          rec.year_end = $1.to_i
        when /\(([0-9]{4})-([0-9]{4})\)/     ## e.g. (2011-2014)
          rec.year     = $1.to_i
          rec.year_end = $2.to_i
        else
          ## todo/check: warn about unknown year format
        end

        ## todo/check - check for unknown format values
        ##   e.g. too many values, duplicate years, etc.
        ##        check for overwriting, etc.
        values.each do |raw|
          ## strip and squish (white)spaces
          #   e.g. León     › Guanajuato  => León › Guanajuato
          value = squish( raw )

          if value =~ /^\d{4}$/   # e.g 1904
            if rec.year
              puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{value}:"
              pp rec
              exit 1
            end
            rec.year = value.to_i
          elsif value.start_with?( '@' )   # e.g. @ Anfield
            ## cut-off leading @ and spaces
            rec.ground = value[1..-1].strip
          else
            ## assume city / geo tree
            geos = split_geo( value )
            city = geos[0]
            ## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
            if city =~ /\((.+?)\)/   ## note: use non-greedy (?) match
              rec.district = $1.strip
              city = city.gsub( /\(.+?\)/, '' ).strip
            end
            rec.city = city

            ## cut-off city and keep the rest (of geo tree)
            rec.geos = geos[1..-1]  if geos.size > 1
          end
        end

        ###############
        ## use headings text for geo tree

        ## 1) add country if present
        if headings.size > 0 && headings[0]
          rec.country = SportDb::Import.config.countries[ headings[0] ]
        else
          ## make it an error - why? why not?
          puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
          exit 1
        end

        ## 2) check geo tree with headings hierarchy
        if headings.size > 1 && headings[1]
          geos = split_geo( headings[1] )
          if rec.geos
            if rec.geos[0] != geos[0]
              puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
              exit 1
            end
            if rec.geos[1] && rec.geos[1] != geos[1]   ## check optional 2nd level too
              puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
              exit 1
            end
          else
            ## add missing region (state/province) from headings hierarchy
            rec.geos = geos
          end
        end

        last_rec = rec

        ### todo/fix:
        ##  auto-add alt name with dots stripped - why? why not?
        ##  e.g. D.C. United      => DC United
        ##  e.g. Liverpool F.C.   => Liverpool FC
        ##  e.g. St. Albin        => St Albin etc.
        ##  e.g. 1. FC Köln       => 1 FC Köln  -- make special case for 1. - why? why not?

        ##
        ## todo/fix: unify mapping entries
        ##   always lowercase !!!!  (case insensitive)
        ##   always strip (2011-) !!!
        ##   always strip dots (e.g. St., F.C, etc.)

        recs << rec
      end
    end # each_line

    recs
  end # method parse


  ### helpers

  # Splits a geo tree into its (stripped) parts
  #   e.g. León     › Guanajuato  => ["León", "Guanajuato"]
  def self.split_geo( str )
    ## note: allow > < or › ‹ as separators
    squish( str ).split( /[<>‹›]/ ).map { |geo| geo.strip }
  end

  # Strips leading and trailing whitespace and squishes runs of spaces
  # and tabs into a single space
  #   e.g. New York FC      (2011-)  => New York FC (2011-)
  def self.squish( str )
    str.strip.gsub( /[ \t]+/, ' ' )
  end
  private_class_method :squish

end # class ClubReader
|
275
|
-
|
276
|
-
|
277
|
-
end ## module Import
|
278
|
-
end ## module SportDb
|