rsssf 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +2 -0
- data/Manifest.txt +39 -2
- data/README.md +67 -62
- data/Rakefile +2 -2
- data/config/groups_en.txt +44 -0
- data/config/rounds_en.txt +283 -0
- data/config/rounds_es.txt +20 -0
- data/config/rounds_misc.txt +7 -0
- data/lib/_cocos_.rb +158 -0
- data/lib/rsssf/convert/convert.rb +71 -0
- data/lib/rsssf/convert/errata.rb +103 -0
- data/lib/rsssf/convert/html_entities.rb +150 -0
- data/lib/rsssf/convert/html_to_txt/beautify_anchors.rb +96 -0
- data/lib/rsssf/convert/html_to_txt/make_heading.rb +70 -0
- data/lib/rsssf/convert/html_to_txt/remove_emails.rb +43 -0
- data/lib/rsssf/convert/html_to_txt/replace_a_href.rb +85 -0
- data/lib/rsssf/convert/html_to_txt/replace_a_name.rb +87 -0
- data/lib/rsssf/convert/html_to_txt/replace_heading.rb +76 -0
- data/lib/rsssf/convert/html_to_txt/replace_hr.rb +25 -0
- data/lib/rsssf/convert/html_to_txt.rb +247 -0
- data/lib/rsssf/download.rb +4 -135
- data/lib/rsssf/fmtfix/dates.rb +541 -0
- data/lib/rsssf/fmtfix/dates_helpers.rb +63 -0
- data/lib/rsssf/fmtfix/errata.rb +44 -0
- data/lib/rsssf/fmtfix/fmtfix-base.rb +68 -0
- data/lib/rsssf/fmtfix/fmtfix.rb +101 -0
- data/lib/rsssf/fmtfix/goals.rb +173 -0
- data/lib/rsssf/fmtfix/headers.rb +326 -0
- data/lib/rsssf/fmtfix/outline.rb +228 -0
- data/lib/rsssf/fmtfix/patch_headings.rb +141 -0
- data/lib/rsssf/fmtfix/rounds.rb +74 -0
- data/lib/rsssf/fmtfix/score.rb +92 -0
- data/lib/rsssf/fmtfix/tables.rb +316 -0
- data/lib/rsssf/fmtfix/topscorers.rb +50 -0
- data/lib/rsssf/page-find_schedule.rb +127 -0
- data/lib/rsssf/page-meta.rb +68 -0
- data/lib/rsssf/page.rb +89 -227
- data/lib/rsssf/parse_schedules.rb +34 -0
- data/lib/rsssf/prepare/convert-links.rb +77 -0
- data/lib/rsssf/prepare/convert-meta.rb +111 -0
- data/lib/rsssf/prepare/convert-navlines.rb +154 -0
- data/lib/rsssf/prepare/convert-postproc.rb +141 -0
- data/lib/rsssf/prepare/convert.rb +100 -0
- data/lib/rsssf/prepare/download.rb +40 -0
- data/lib/rsssf/project.rb +154 -0
- data/lib/rsssf/reports/page.rb +40 -8
- data/lib/rsssf/reports/schedule.rb +18 -55
- data/lib/rsssf/utils.rb +28 -17
- data/lib/rsssf/version.rb +5 -2
- data/lib/rsssf.rb +53 -13
- metadata +50 -9
- data/lib/rsssf/convert.rb +0 -495
- data/lib/rsssf/repo.rb +0 -144
data/lib/rsssf/page.rb
CHANGED
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
|
|
2
2
|
|
|
3
3
|
module Rsssf
|
|
4
|
-
|
|
4
|
+
|
|
5
5
|
|
|
6
6
|
PageStat = Struct.new(
|
|
7
7
|
:source, ## e.g. https://rsssf.org/tabled/duit89.html
|
|
8
8
|
:year, ## e.g. 1989 -- note: always four digits
|
|
9
|
+
:title,
|
|
9
10
|
:authors,
|
|
10
11
|
:last_updated,
|
|
11
12
|
:line_count, ## todo: rename to (just) lines - why? why not?
|
|
@@ -25,13 +26,13 @@ module Rsssf
|
|
|
25
26
|
class Page
|
|
26
27
|
|
|
27
28
|
include Utils ## e.g. year_from_name, etc.
|
|
28
|
-
|
|
29
|
+
|
|
29
30
|
def self.read_cache( url ) ### use read_cache /web/html or such - why? why not?
|
|
30
31
|
html = Webcache.read( url )
|
|
31
32
|
|
|
32
33
|
puts "html:"
|
|
33
34
|
pp html[0..400]
|
|
34
|
-
|
|
35
|
+
|
|
35
36
|
txt = PageConverter.convert( html, url: url )
|
|
36
37
|
txt
|
|
37
38
|
|
|
@@ -41,7 +42,7 @@ end
|
|
|
41
42
|
|
|
42
43
|
## Build a page from a text file on disk.
##   note: the source is always assumed to be (already) converted from html to txt!
##
##   path - file to read (passed on as-is to read_text)
##
## Returns a new page wrapping the text that got read.
def self.read_txt( path )
  new( read_text( path ) )
end
|
|
47
48
|
|
|
@@ -49,260 +50,125 @@ end
|
|
|
49
50
|
|
|
50
51
|
### use text alias too (for txt) - why? why not?
|
|
51
52
|
attr_accessor :txt
|
|
52
|
-
|
|
53
|
-
## quick hack? used for auto-patch machinery
|
|
54
|
-
attr_accessor :patch
|
|
55
53
|
attr_accessor :url ### source url
|
|
56
54
|
|
|
57
55
|
|
|
58
56
|
## Set up a page for the passed-in text.
##   note: the source url starts out unset (nil); assign via the url accessor.
def initialize( txt )
  @url = nil
  @txt = txt
end
|
|
64
60
|
|
|
65
61
|
|
|
66
|
-
LEAGUE_ROUND_REGEX = /\b
|
|
67
|
-
Round
|
|
68
|
-
\b/ix
|
|
69
|
-
|
|
70
|
-
CUP_ROUND_REGEX = /\b(
|
|
71
|
-
Round |
|
|
72
|
-
1\/8\sFinals |
|
|
73
|
-
1\/16\sFinals |
|
|
74
|
-
Quarterfinals |
|
|
75
|
-
Semifinals |
|
|
76
|
-
Final
|
|
77
|
-
)\b/ix
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
## make header required - why? why not?
|
|
82
|
-
def find_schedule( header: nil,
|
|
83
|
-
cup: false ) ## change to build_schedule - why? why not???
|
|
84
|
-
|
|
85
|
-
## find match schedule/fixtures in multi-league doc
|
|
86
|
-
new_txt = String.new
|
|
87
|
-
|
|
88
|
-
## note: keep track of statistics
|
|
89
|
-
## e.g. number of rounds found
|
|
90
|
-
|
|
91
|
-
round_count = 0
|
|
92
|
-
|
|
93
|
-
if header
|
|
94
|
-
league_header_found = false
|
|
95
|
-
|
|
96
|
-
## header:
|
|
97
|
-
## - assumes heading 4 e.g. #### Premier League or
|
|
98
|
-
## - bold e.g. **FA Cup** for now
|
|
99
|
-
## note: markers must start line (^)
|
|
100
|
-
|
|
101
|
-
## note:
|
|
102
|
-
## header gsub spaces to \s otherwise no match in regex (using free-form x-flag)!!!
|
|
103
|
-
header_esc = header.gsub( ' ', '\s' )
|
|
104
|
-
|
|
105
|
-
## note: somehow #{2,4} will not work with free-form /xi defined (picked up as comment?)
|
|
106
|
-
## use [#] hack ??
|
|
107
|
-
header_regex = /^
|
|
108
|
-
([#]{2,4}\s+(#{header_esc}))
|
|
109
|
-
|
|
|
110
|
-
(\*{2}(#{header_esc})) ## was: \*{2})
|
|
111
|
-
## do not inluce trailing ** for now (allows anchors e.g. §)
|
|
112
|
-
/ix
|
|
113
|
-
|
|
114
|
-
## todo:
|
|
115
|
-
## use new stage_regex e.g. **xxx** - why? why not?
|
|
116
|
-
## allow more than one stage in one schedule (e.g. regular stage,playoff stage etc)
|
|
117
|
-
|
|
118
|
-
else
|
|
119
|
-
league_header_found = true # default (no header; assume single league file)
|
|
120
|
-
header_regex = /^---dummy---$/ ## non-matching dummy regex
|
|
121
|
-
end
|
|
122
|
-
|
|
123
|
-
## puts "header_regex:"
|
|
124
|
-
## pp header_regex
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
if cup
|
|
128
|
-
round_regex = CUP_ROUND_REGEX ## note: only allow final, quaterfinals, etc. if knockout cup
|
|
129
|
-
else
|
|
130
|
-
round_regex = LEAGUE_ROUND_REGEX
|
|
131
|
-
end
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
## stages
|
|
135
|
-
first_round_header_found = false
|
|
136
|
-
round_header_found = false
|
|
137
|
-
round_body_found = false ## allow round header followed by blank lines
|
|
138
|
-
|
|
139
|
-
blank_found = false
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
@txt.each_line do |line|
|
|
144
|
-
|
|
145
|
-
if league_header_found == false
|
|
146
|
-
## first find start of league header/section
|
|
147
|
-
if line =~ header_regex
|
|
148
|
-
puts "!!! bingo - found header >#{line}<"
|
|
149
|
-
league_header_found = true
|
|
150
|
-
|
|
151
|
-
## note - do NOT auto-add header/title !!!
|
|
152
|
-
# title = line.gsub( /[#*]/, '' ).strip ## quick hack: extract title from header
|
|
153
|
-
# new_txt << "## #{title}\n\n" # note: use header/stage title (regex group capture)
|
|
154
|
-
else
|
|
155
|
-
puts " searching for header >#{header}<; skipping line >#{line}<"
|
|
156
|
-
next
|
|
157
|
-
end
|
|
158
|
-
elsif first_round_header_found == false
|
|
159
|
-
## next look for first round (starting w/ Round)
|
|
160
|
-
if line =~ round_regex
|
|
161
|
-
puts "!!! bingo - found first round >#{line}<"
|
|
162
|
-
round_count += 1
|
|
163
|
-
first_round_header_found = true
|
|
164
|
-
round_header_found = true
|
|
165
|
-
round_body_found = false
|
|
166
|
-
new_txt << line
|
|
167
|
-
elsif line =~ /^=-=-=-=/
|
|
168
|
-
puts "*** no rounds found; hit section marker (horizontal rule)"
|
|
169
|
-
break
|
|
170
|
-
elsif line =~ /^\*{2}[^*]+\*{2}/ ## e.g. **FA Cup**
|
|
171
|
-
puts "*** no rounds found; hit section/stage header: #{line}"
|
|
172
|
-
break
|
|
173
|
-
else
|
|
174
|
-
puts " searching for first round; skipping line >#{line}<"
|
|
175
|
-
next ## continue; searching
|
|
176
|
-
end
|
|
177
|
-
elsif round_header_found == true
|
|
178
|
-
## collect rounds;
|
|
179
|
-
## assume text block until next blank line
|
|
180
|
-
## new block must allways start w/ round
|
|
181
|
-
if line =~ /^\s*$/ ## blank line?
|
|
182
|
-
if round_body_found
|
|
183
|
-
round_header_found = false
|
|
184
|
-
blank_found = true ## keep track of blank (lines) - allow inside round block (can continue w/ date header/marker)
|
|
185
|
-
new_txt << line
|
|
186
|
-
else
|
|
187
|
-
## note: skip blanks following header
|
|
188
|
-
next
|
|
189
|
-
end
|
|
190
|
-
else
|
|
191
|
-
round_body_found = true
|
|
192
|
-
new_txt << line ## keep going until next blank line
|
|
193
|
-
end
|
|
194
|
-
else
|
|
195
|
-
## skip (more) blank lines
|
|
196
|
-
if line =~ /^\s*$/
|
|
197
|
-
next ## continue; skip extra blank line
|
|
198
|
-
elsif line =~ round_regex
|
|
199
|
-
puts "!!! bingo - found new round >#{line}<"
|
|
200
|
-
round_count += 1
|
|
201
|
-
round_header_found = true # more rounds; continue
|
|
202
|
-
round_body_found = false
|
|
203
|
-
blank_found = false # reset blank tracker
|
|
204
|
-
new_txt << line
|
|
205
|
-
elsif blank_found && line =~ /\[[a-z]{3} \d{1,2}\]/i ## e.g. [Mar 13] or [May 5] with leading blank line; continue round
|
|
206
|
-
puts "!!! bingo - continue round >#{line}<"
|
|
207
|
-
round_header_found = true
|
|
208
|
-
blank_found = false # reset blank tracker
|
|
209
|
-
new_txt << line
|
|
210
|
-
elsif blank_found && line =~ /First Legs|Second Legs/i
|
|
211
|
-
puts "!!! bingo - continue round >#{line}<"
|
|
212
|
-
round_header_found = true
|
|
213
|
-
blank_found = false # reset blank tracker
|
|
214
|
-
new_txt << line
|
|
215
|
-
elsif line =~ /=-=-=-=/
|
|
216
|
-
puts "!!! stop schedule; hit section marker (horizontal rule)"
|
|
217
|
-
break;
|
|
218
|
-
elsif line =~ /^\*{2}[^*]+\*{2}/ ## e.g. **FA Cup**
|
|
219
|
-
puts "!!! stop schedule; hit section/stage header: #{line}"
|
|
220
|
-
break
|
|
221
|
-
else
|
|
222
|
-
blank_found = false
|
|
223
|
-
puts "skipping line in schedule >#{line}<"
|
|
224
|
-
next # continue
|
|
225
|
-
end
|
|
226
|
-
end
|
|
227
|
-
end # each line
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
## quick hack?
|
|
231
|
-
### auto-apply patch if patch configured
|
|
232
|
-
if @patch && @patch.respond_to?(:on_patch)
|
|
233
|
-
url_path = URI.parse( url ).path
|
|
234
|
-
basename = File.basename( url_path, File.extname( url_path ))
|
|
235
|
-
year = year_from_name( basename )
|
|
236
|
-
new_txt = @patch.on_patch( new_txt, basename, year )
|
|
237
|
-
end
|
|
238
|
-
|
|
239
|
-
schedule = Schedule.new( new_txt )
|
|
240
|
-
## schedule.rounds = round_count
|
|
241
|
-
|
|
242
|
-
schedule
|
|
243
|
-
end # method find_schedule
|
|
244
62
|
|
|
245
63
|
|
|
246
64
|
|
|
65
|
+
##  lets you check for an optional (trailing) ref  e.g. ‹§fin›
##    note - kept as a regex source string (for now); gets interpolated
##           into HX_RE (free-form/extended mode - whitespace is ignored)
###  todo/fix - change to OPT_REF_RE - make it regex
##     regex embedded in regex will use regex.source automatic (no need to escape)!!
OPT_REF = %q{
             (?:  [ ]*
                ‹§ (?<ref> [^›]+?) ›
             )?
          }.freeze


##  heading line  e.g.  == Heading   or   === Heading ‹§ref›
##    named captures (in order): marker, text, ref (ref is nil if missing)
HX_RE = %r{   ## negative lookahead
              ##   do NOT match =-=
              ##   do NOT match ===========  (without any heading text!!)
              ##    e.g.
              ##      Fall season
              ##      ===========

              (?! ^[ ]* (?:   =-=
                            | ={1,} [ ]* $
                        )
              )

              ^
               [ ]*

               (?<marker> ={1,6})
               [ ]*
               (?<text> .+?)
               #{OPT_REF}
               [ ]*
              $}x



##
##  change to outline - why? why not?

##  Scan this page's text for headings.
##   Returns an array of [marker, text, ref] triplets (one per heading line).
def _scan_headings
  txt.scan( HX_RE )
end


##  Build a (flat) table of contents for the passed-in text.
##
##   txt - text to scan for heading lines (see HX_RE)
##
##   Returns an array of strings - one "<marker> <text>" entry per heading
##     e.g. ["== Intro", "=== Sub"];  note: the optional ref gets dropped.
def _build_toc( txt )
  txt.scan( HX_RE ).map { |marker, text, _ref| "#{marker} #{text}" }
end
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
=begin
|
|
122
|
+
<!--
|
|
123
|
+
title: Austria 2002/03
|
|
124
|
+
source: https://rsssf.org/tableso/oost03.html
|
|
125
|
+
authors: Andreas Exenberger and Karel Stokkermans
|
|
126
|
+
updated: 15 Jun 2022
|
|
127
|
+
-->
|
|
128
|
+
|
|
129
|
+
=end
|
|
130
|
+
|
|
131
|
+
|
|
247
132
|
##  Collect the summary record (PageStat) for this page.
##
##   Reads title, source, author(s) and last updated from the page meta
##   (via parse_meta), derives the year from the source url's basename and
##   adds line/char counts plus the heading list (via _build_toc).
##
##   note: a missing source / author(s) / last updated gets reported
##         (printed) only; with no source the year stays nil
##         (instead of failing in URI.parse).
##
##   Returns the filled-in PageStat record.
def build_stat
  ## note - fall back to an empty hash if parse_meta returns nil
  meta = parse_meta( @txt ) || {}

  title        = meta[:title]
  source       = meta[:source]
  authors      = meta[:author] || meta[:authors]   ## note - check for author & authors !!!
  last_updated = meta[:updated]

  puts "*** !!! missing source"        if source.nil?
  puts "*** !!! missing author(s)"     if authors.nil?
  puts "** !!! missing last updated"   if last_updated.nil?


  ## get year from source (url)
  ###  move (for reuse) to year_from_url in utils - why? why not?
  year = nil
  if source
    url_path = URI.parse( source ).path
    basename = File.basename( url_path, File.extname( url_path ) )  ## e.g. duit92.txt or duit92.html => duit92
    puts " basename=>#{basename}<"
    year = year_from_name( basename )
  end

  sections = _build_toc( txt )


  rec = PageStat.new
  rec.source       = source       # e.g. http://rsssf.org/tabled/duit89.html   -- use source_url - why?? why not??
  rec.year         = year         ## note: in 2021/22 - year is always end_year, that is, 2022
  rec.title        = title
  rec.authors      = authors
  rec.last_updated = last_updated
  rec.line_count   = @txt.lines.count   ### or @txt.each_line.count
  rec.char_count   = @txt.size          ## note - size/length is true char count (@txt.bytesize is byte count!!)
  rec.sections     = sections

  rec
end  ## method build_stat
|
|
@@ -314,7 +180,3 @@ end ## method save
|
|
|
314
180
|
|
|
315
181
|
end ## class Page
|
|
316
182
|
end ## module Rsssf
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
##
|
|
4
|
+
## todo/check - find a better name
|
|
5
|
+
## rename to parse_sections/leagues/??? - why? why not?
|
|
6
|
+
##  Parse a schedules config (in csv format).
##
##   txt - config text; gets passed on to parse_csv
##
##   note: a present and non-empty 'seasons' column gets replaced (in-place)
##         by the result of Season.parse_line; all other columns stay as-is.
##
##   Returns the rows (as returned by parse_csv).
def parse_schedules( txt )
  recs = parse_csv( txt )

  recs.each do |rec|
    seasons = rec['seasons']
    next if !seasons || seasons.empty?   ## nothing to transform

    rec['seasons'] = Season.parse_line( seasons )
  end

  recs
end
|
|
16
|
+
##  Read a schedules config from a file; same as parse_schedules
##   but for the text read (via read_text) from the passed-in path.
def read_schedules( path )
  txt = read_text( path )
  parse_schedules( txt )
end
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
__END__
|
|
22
|
+
|
|
23
|
+
############
|
|
24
|
+
## sample usage
|
|
25
|
+
|
|
26
|
+
configs = parse_schedules( <<TXT )
|
|
27
|
+
|
|
28
|
+
header, seasons, basename, title
|
|
29
|
+
Bundesliga, 2010/11..2025/26, 1-bundesliga, Austria | Bundesliga {season}
|
|
30
|
+
ÖFB Cup, 2010/11..2025/26, cup, Austria | ÖFB Cup {season}
|
|
31
|
+
|
|
32
|
+
TXT
|
|
33
|
+
|
|
34
|
+
## pp configs
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
|
|
2
|
+
module Rsssf
class Prep    ## todo: find a better name e.g.  BatchPrep or ??


  ##  inline page link  e.g.
  ##     see page 2006f
  ##     see page ../tablesw/worldcup
  ##     ‹League C, see page 2023uefanl§lgc›
  ##     ‹League A, see page 2023uefanl.html#lga›
  ##   todo/fix - fix upstream ?? (e.g. remove .html and replace #=>§)
  ##
  ##   named captures (in order): title, pageref
  LINK_APAGE_RE = %r{ ‹(?<title> [^›]+?)
                         , [ ] see [ ] page [ ]
                        (?<pageref> [^›]+?)
                      ›
                    }ix

  ##  sample [title, pageref] pairs (presumably collected before expansion):
  ##     ["1973/74", "oost74"],
  ##     ["1975/76", "oost76"],
  ##     ["list of final tables", "oosthist"],
  ##     ["list of champions", "oostchamp"],
  ##     ["list of cup finals", "oostcuphist"],
  ##     ["list of super cup finals", "oostsupcuphist"],
  ##     ["list of foundation dates", "oostfound"]]


  ##  Expand a page ref (as found in a page link) to a dir-qualified page ref.
  ##
  ##   pageref - e.g. oost74, 2023uefanl.html#lga or ../tablesw/worldcup
  ##   dirname - dir to use for "local" (relative) page refs e.g. tableso
  ##
  ##   Returns the expanded page ref e.g.
  ##      oost74                 =>  <dirname>/oost74
  ##      2023uefanl.html#lga    =>  <dirname>/2023uefanl§lga
  ##      ../tablesw/worldcup    =>  tablesw/worldcup
  ##
  ##   Raises ArgumentError for any other (unsupported) style
  ##     e.g. ./page, /page, https://..  and so on.
  def expand_pageref( pageref, dirname: )
    ##
    ##  note - pre-process
    ##     2023uefanl.html#lga
    ##     stkitts2025.html#pres
    ##
    ##   remove .html
    ##     and convert the optional anchor (# => §)
    ##
    ##   fix - upstream - why? why not?
    pageref = pageref.sub( %r{ \.html\b }ix, '' )
    ##  check - only really one # allowed in url path???
    pageref = pageref.sub( '#', '§' )

    ##  note - use \A and \z (NOT ^ and $) to check the complete string;
    ##           ^ and $ match per line, that is, a multi-line page ref
    ##           with a "local"-looking first line would pass
    if /\A[a-z0-9][a-z0-9§-]*\z/.match?( pageref )
      ## assume relative page in "local" dir
      "#{dirname}/#{pageref}"
    elsif pageref.start_with?( '../' )
      ##  ../tablesw/worldcup
      pageref.sub( "../", '' )
    elsif pageref.start_with?( './' )
      raise ArgumentError, "found (unsupported) ./ pageref >#{pageref}<"
    elsif pageref.start_with?( '/' )
      raise ArgumentError, "found (unsupported) / pageref >#{pageref}<"
    elsif pageref.start_with?( %r{^https?:}i )
      raise ArgumentError, "found (unsupported) https?: pageref >#{pageref}<"
    else
      raise ArgumentError, "found (unsupported) pageref >#{pageref}<"
    end
  end


  ##  Collect all page links (see LINK_APAGE_RE) in the passed-in text.
  ##
  ##   txt      - text to scan
  ##   basename - required for now but not used in here
  ##   dirname  - dir for "local" page refs (passed on to expand_pageref)
  ##
  ##   Returns an array of [title, expanded pageref] pairs.
  ##   Raises ArgumentError (via expand_pageref) on an unsupported page ref.
  def collect_links( txt, basename:, dirname: )
    txt.scan( LINK_APAGE_RE ).map do |title, pageref|
      [title, expand_pageref( pageref, dirname: dirname )]
    end
  end


end  ## class Prep
end  ## module Rsssf
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
|
|
2
|
+
module Rsssf
class Prep    ## todo: find a better name e.g.  BatchPrep or ??


  ###
  #  note - check for special cases (later) with no "about this document" section!!
  #
  ##   https://rsssf.org/tablesb/braz98.html
  ##     has no about document section
  #      and only a last update: 22 Apr 1999 line (no author)


  ##  html page title  e.g.  <TITLE>Brazil 1988</TITLE>
  TITLE_RE = %r{
                 <TITLE>(?<text>.*?)</TITLE>
               }ixm

  ##  Find the page title in the passed-in html.
  ##
  ##   Returns the title text (surrounding whitespace stripped, html entities
  ##   converted via PageConverter and any < removed) or nil if no title found.
  def find_title( html )
    m = TITLE_RE.match( html )
    return nil if m.nil?

    ##  note - convert html entities
    ##   e.g. Brazil 2000 - Copa João Havelange
    title = PageConverter.convert_html_entities( m[:text].strip )

    ## add autofix known typos/erratas here!!!
    ##  note - title quick typo fix (in brazil) remove <
    ##    e.g. <TITLE>Brazil 1988<</TITLE>
    title.delete( '<' )
  end



  ABOUT_META_RE = %r{
                  ## (i) author(s) info
                    \b authors? [ ]* :
                      \s+
                        (?<author> .+?)   ## note - non-greedy (may incl. newline break!!)
                      \s+
                  ## (ii) followed by date
                    \b last [ ]+ updated [ ]*:
                      \s*
                      (?<date> \d{1,2}    [ ]+   ## day
                               [a-z]{3,10} [ ]+  ## month
                               \d{4} \b)         ## year
                  }ixm



  ## change name to authors_n_updated or such - why? why not?

  ##  Find the author(s) and the last updated date in the passed-in text.
  ##
  ##   Returns an [authors, updated] pair (both with whitespace cleaned-up)
  ##   or [nil,nil] if the author(s) / last updated lines are missing.
  def find_author_n_date( txt )
    ##
    ## fix/todo:  move authors n last updated
    ##              whitespace cleanup  - why? why not??

    m = ABOUT_META_RE.match( txt )

    ## report error or raise exception??
    ##   return nil for now
    return [nil,nil] unless m    ##  or return (single) nil ??

    author_text = m[:author].strip
    author_text = author_text.gsub( /\s+/, ' ' )           # cleanup whitespace; squish-style
    author_text = author_text.gsub( /[ ]*,[ ]*/, ', ' )    # prettify commas - always single space after comma (no space before)

    date_text = m[:date].strip
    date_text = date_text.gsub( /\s+/, ' ' )

    [author_text, date_text]
  end


end  ## class Prep
end  ## module Rsssf
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
=begin
|
|
86
|
+
e.g.
|
|
87
|
+
|
|
88
|
+
Authors: Hans Schöggl, Jan Schoenmakers and Karel Stokkermans
|
|
89
|
+
|
|
90
|
+
Last updated: 7 Mar 2023
|
|
91
|
+
|
|
92
|
+
-or-
|
|
93
|
+
|
|
94
|
+
Authors: Ambrosius Kutschera
|
|
95
|
+
and Karel Stokkermans
|
|
96
|
+
Last updated: 31 Oct 2004
|
|
97
|
+
|
|
98
|
+
-or-
|
|
99
|
+
|
|
100
|
+
Author: RSSSF
|
|
101
|
+
|
|
102
|
+
Last updated: 15 Jun 2022
|
|
103
|
+
|
|
104
|
+
-or-
|
|
105
|
+
|
|
106
|
+
Authors: Andreas Exenberger, Hans Schöggl
|
|
107
|
+
and Karel Stokkermans
|
|
108
|
+
|
|
109
|
+
Last updated: 15 Jul 2022
|
|
110
|
+
|
|
111
|
+
=end
|