rsssf 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/{HISTORY.md → CHANGELOG.md} +4 -0
- data/Manifest.txt +41 -7
- data/README.md +93 -71
- data/Rakefile +8 -7
- data/config/groups_en.txt +44 -0
- data/config/rounds_en.txt +283 -0
- data/config/rounds_es.txt +20 -0
- data/config/rounds_misc.txt +7 -0
- data/lib/_cocos_.rb +158 -0
- data/lib/rsssf/convert/convert.rb +71 -0
- data/lib/rsssf/convert/errata.rb +103 -0
- data/lib/rsssf/convert/html_entities.rb +150 -0
- data/lib/rsssf/convert/html_to_txt/beautify_anchors.rb +96 -0
- data/lib/rsssf/convert/html_to_txt/make_heading.rb +70 -0
- data/lib/rsssf/convert/html_to_txt/remove_emails.rb +43 -0
- data/lib/rsssf/convert/html_to_txt/replace_a_href.rb +85 -0
- data/lib/rsssf/convert/html_to_txt/replace_a_name.rb +87 -0
- data/lib/rsssf/convert/html_to_txt/replace_heading.rb +76 -0
- data/lib/rsssf/convert/html_to_txt/replace_hr.rb +25 -0
- data/lib/rsssf/convert/html_to_txt.rb +247 -0
- data/lib/rsssf/download.rb +20 -0
- data/lib/rsssf/fmtfix/dates.rb +541 -0
- data/lib/rsssf/fmtfix/dates_helpers.rb +63 -0
- data/lib/rsssf/fmtfix/errata.rb +44 -0
- data/lib/rsssf/fmtfix/fmtfix-base.rb +68 -0
- data/lib/rsssf/fmtfix/fmtfix.rb +101 -0
- data/lib/rsssf/fmtfix/goals.rb +173 -0
- data/lib/rsssf/fmtfix/headers.rb +326 -0
- data/lib/rsssf/fmtfix/outline.rb +228 -0
- data/lib/rsssf/fmtfix/patch_headings.rb +141 -0
- data/lib/rsssf/fmtfix/rounds.rb +74 -0
- data/lib/rsssf/fmtfix/score.rb +92 -0
- data/lib/rsssf/fmtfix/tables.rb +316 -0
- data/lib/rsssf/fmtfix/topscorers.rb +50 -0
- data/lib/rsssf/page-find_schedule.rb +127 -0
- data/lib/rsssf/page-meta.rb +68 -0
- data/lib/rsssf/page.rb +125 -238
- data/lib/rsssf/parse_schedules.rb +34 -0
- data/lib/rsssf/prepare/convert-links.rb +77 -0
- data/lib/rsssf/prepare/convert-meta.rb +111 -0
- data/lib/rsssf/prepare/convert-navlines.rb +154 -0
- data/lib/rsssf/prepare/convert-postproc.rb +141 -0
- data/lib/rsssf/prepare/convert.rb +100 -0
- data/lib/rsssf/prepare/download.rb +40 -0
- data/lib/rsssf/project.rb +154 -0
- data/lib/rsssf/reports/page.rb +66 -23
- data/lib/rsssf/reports/schedule.rb +89 -40
- data/lib/rsssf/schedule.rb +4 -14
- data/lib/rsssf/utils.rb +37 -45
- data/lib/rsssf/version.rb +7 -6
- data/lib/rsssf.rb +82 -19
- metadata +68 -26
- data/.gemtest +0 -0
- data/lib/rsssf/fetch.rb +0 -80
- data/lib/rsssf/html2txt.rb +0 -157
- data/lib/rsssf/patch.rb +0 -28
- data/lib/rsssf/repo.rb +0 -220
- data/test/helper.rb +0 -12
- data/test/test_utils.rb +0 -83
data/lib/rsssf/page.rb
CHANGED
|
@@ -1,13 +1,12 @@
|
|
|
1
|
-
# encoding: utf-8
|
|
2
1
|
|
|
3
2
|
|
|
4
3
|
module Rsssf
|
|
5
4
|
|
|
5
|
+
|
|
6
6
|
PageStat = Struct.new(
|
|
7
|
-
:source, ## e.g.
|
|
8
|
-
:basename, ## e.g. duit89 -- note: filename w/o extension (and path)
|
|
7
|
+
:source, ## e.g. https://rsssf.org/tabled/duit89.html
|
|
9
8
|
:year, ## e.g. 1989 -- note: always four digits
|
|
10
|
-
:
|
|
9
|
+
:title,
|
|
11
10
|
:authors,
|
|
12
11
|
:last_updated,
|
|
13
12
|
:line_count, ## todo: rename to (just) lines - why? why not?
|
|
@@ -28,268 +27,156 @@ class Page
|
|
|
28
27
|
|
|
29
28
|
include Utils ## e.g. year_from_name, etc.
|
|
30
29
|
|
|
31
|
-
def self.
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
30
|
+
## Build a page from the cached html copy of the given url.
##   The html gets read via Webcache and converted to txt
##   via PageConverter before getting wrapped in a new page.
##
##   note: prints the leading html (chars 0..400) for debugging
###  use read_cache /web/html or such - why? why not?
def self.read_cache( url )
  raw_html = Webcache.read( url )

  puts "html:"
  pp raw_html[0..400]

  new( PageConverter.convert( raw_html, url: url ) )
end
|
|
41
41
|
|
|
42
|
-
|
|
43
|
-
|
|
42
|
+
|
|
43
|
+
## Build a page from a text file at the given path.
##   note: always assume sources (already) converted from html to txt!!!!
def self.read_txt( path )    ## use read_txt
  new( read_text( path ) )
end
|
|
45
|
-
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
### use text alias too (for txt) - why? why not?
##   txt - the page text
##   url - the source url
attr_accessor :txt, :url
|
|
54
|
+
|
|
55
|
+
|
|
46
56
|
## Keep the passed-in page text; the source url starts out as nil.
def initialize( txt )
  @url = nil
  @txt = txt
end
|
|
49
60
|
|
|
50
61
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
blank_found = false
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
@txt.each_line do |line|
|
|
125
|
-
|
|
126
|
-
if league_header_found == false
|
|
127
|
-
## first find start of league header/section
|
|
128
|
-
if line =~ header_regex
|
|
129
|
-
puts "!!! bingo - found header >#{line}<"
|
|
130
|
-
league_header_found = true
|
|
131
|
-
title = line.gsub( /[#*]/, '' ).strip ## quick hack: extract title from header
|
|
132
|
-
new_txt << "## #{title}\n\n" # note: use header/stage title (regex group capture)
|
|
133
|
-
else
|
|
134
|
-
puts " searching for header >#{header}<; skipping line >#{line}<"
|
|
135
|
-
next
|
|
136
|
-
end
|
|
137
|
-
elsif first_round_header_found == false
|
|
138
|
-
## next look for first round (starting w/ Round)
|
|
139
|
-
if line =~ round_regex
|
|
140
|
-
puts "!!! bingo - found first round >#{line}<"
|
|
141
|
-
round_count += 1
|
|
142
|
-
first_round_header_found = true
|
|
143
|
-
round_header_found = true
|
|
144
|
-
round_body_found = false
|
|
145
|
-
new_txt << line
|
|
146
|
-
elsif line =~ /^=-=-=-=/
|
|
147
|
-
puts "*** no rounds found; hit section marker (horizontal rule)"
|
|
148
|
-
break
|
|
149
|
-
elsif line =~ /^\*{2}[^*]+\*{2}/ ## e.g. **FA Cup**
|
|
150
|
-
puts "*** no rounds found; hit section/stage header: #{line}"
|
|
151
|
-
break
|
|
152
|
-
else
|
|
153
|
-
puts " searching for first round; skipping line >#{line}<"
|
|
154
|
-
next ## continue; searching
|
|
155
|
-
end
|
|
156
|
-
elsif round_header_found == true
|
|
157
|
-
## collect rounds;
|
|
158
|
-
## assume text block until next blank line
|
|
159
|
-
## new block must always start w/ round
|
|
160
|
-
if line =~ /^\s*$/ ## blank line?
|
|
161
|
-
if round_body_found
|
|
162
|
-
round_header_found = false
|
|
163
|
-
blank_found = true ## keep track of blank (lines) - allow inside round block (can continue w/ date header/marker)
|
|
164
|
-
new_txt << line
|
|
165
|
-
else
|
|
166
|
-
## note: skip blanks following header
|
|
167
|
-
next
|
|
168
|
-
end
|
|
169
|
-
else
|
|
170
|
-
round_body_found = true
|
|
171
|
-
new_txt << line ## keep going until next blank line
|
|
172
|
-
end
|
|
173
|
-
else
|
|
174
|
-
## skip (more) blank lines
|
|
175
|
-
if line =~ /^\s*$/
|
|
176
|
-
next ## continue; skip extra blank line
|
|
177
|
-
elsif line =~ round_regex
|
|
178
|
-
puts "!!! bingo - found new round >#{line}<"
|
|
179
|
-
round_count += 1
|
|
180
|
-
round_header_found = true # more rounds; continue
|
|
181
|
-
round_body_found = false
|
|
182
|
-
blank_found = false # reset blank tracker
|
|
183
|
-
new_txt << line
|
|
184
|
-
elsif blank_found && line =~ /\[[a-z]{3} \d{1,2}\]/i ## e.g. [Mar 13] or [May 5] with leading blank line; continue round
|
|
185
|
-
puts "!!! bingo - continue round >#{line}<"
|
|
186
|
-
round_header_found = true
|
|
187
|
-
blank_found = false # reset blank tracker
|
|
188
|
-
new_txt << line
|
|
189
|
-
elsif blank_found && line =~ /First Legs|Second Legs/i
|
|
190
|
-
puts "!!! bingo - continue round >#{line}<"
|
|
191
|
-
round_header_found = true
|
|
192
|
-
blank_found = false # reset blank tracker
|
|
193
|
-
new_txt << line
|
|
194
|
-
elsif line =~ /=-=-=-=/
|
|
195
|
-
puts "!!! stop schedule; hit section marker (horizontal rule)"
|
|
196
|
-
break;
|
|
197
|
-
elsif line =~ /^\*{2}[^*]+\*{2}/ ## e.g. **FA Cup**
|
|
198
|
-
puts "!!! stop schedule; hit section/stage header: #{line}"
|
|
199
|
-
break
|
|
200
|
-
else
|
|
201
|
-
blank_found = false
|
|
202
|
-
puts "skipping line in schedule >#{line}<"
|
|
203
|
-
next # continue
|
|
204
|
-
end
|
|
205
|
-
end
|
|
206
|
-
end # each line
|
|
207
|
-
|
|
208
|
-
schedule = Schedule.from_string( new_txt )
|
|
209
|
-
schedule.rounds = round_count
|
|
210
|
-
|
|
211
|
-
schedule
|
|
212
|
-
end # method find_schedule
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
## lets you check for an optional ref e.g. ‹§fin›
### todo/fix - change to OPT_REF_RE - make it regex
##    regex embedded in regex will use regex.source automatic (no need to escape)!!
##
##  note: kept as a (frozen) string for now; the whitespace inside
##        only works when interpolated into an extended-mode (x) regex
##        such as HX_RE below
OPT_REF = %q{
(?: [ ]*
‹§ (?<ref> [^›]+?) ›
)?
}.freeze


##  heading line regex
##    captures (in order): marker  - one to six = chars
##                         text    - the heading text
##                         ref     - optional ref (see OPT_REF), nil if missing
HX_RE = %r{ ## negative lookahead
## do NOT match =-=
## do NOT match =========== (without any heading text!!)
## e.g.
## Fall season
## ===========

(?! ^[ ]* (?: =-=
| ={1,} [ ]* $
)
)

^
[ ]*

(?<marker> ={1,6})
[ ]*
(?<text> .+?)
#{OPT_REF}
[ ]*
$}x
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
##
|
|
101
|
+
## change to outline - why? why not?
|
|
102
|
+
## Scan the page text for headings (see HX_RE);
##   returns the raw scan result, that is, one capture array per heading.
def _scan_headings
  txt.scan( HX_RE )
end
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
## Build a table of contents from the headings found in txt.
##
##   txt - the text to scan (with HX_RE)
##
##  returns an array of strings, one per heading, in the
##    format "<marker> <text>" e.g. "== Final";
##    the optional heading ref gets dropped.
def _build_toc( txt )
  txt.scan( HX_RE ).map do |marker, text, _ref|
    "#{marker} #{text}"
  end
end
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
=begin
|
|
122
|
+
<!--
|
|
123
|
+
title: Austria 2002/03
|
|
124
|
+
source: https://rsssf.org/tableso/oost03.html
|
|
125
|
+
authors: Andreas Exenberger and Karel Stokkermans
|
|
126
|
+
updated: 15 Jun 2022
|
|
127
|
+
-->
|
|
128
|
+
|
|
129
|
+
=end
|
|
213
130
|
|
|
214
131
|
|
|
215
132
|
## Collect the page statistics from the page's meta data and text.
##
##  returns a PageStat record with source, year, title, authors,
##    last_updated, line_count, char_count and sections filled in.
##    Missing meta entries stay nil; a warning gets printed for
##    a missing source, author(s) or last updated.
def build_stat
  meta = parse_meta( @txt ) || {}

  title        = meta[:title]
  source       = meta[:source]
  authors      = meta[:author] || meta[:authors]   ## note - check for author & authors !!!
  last_updated = meta[:updated]


  puts "*** !!! missing source" if source.nil?
  puts "*** !!! missing author(s)" if authors.nil?
  puts "** !!! missing last updated" if last_updated.nil?


  ## get year from source (url)
  ###  move (for reuse) to year_from_url in utils - why? why not?
  ##   note: without a source there is no url to get the year from;
  ##         keep the year nil (instead of crashing in URI.parse( nil ))
  year = nil
  if source
    url_path = URI.parse( source ).path
    basename = File.basename( url_path, File.extname( url_path ) )   ## e.g. duit92.txt or duit92.html => duit92
    puts " basename=>#{basename}<"
    year = year_from_name( basename )
  end


  sections = _build_toc( txt )


  rec = PageStat.new
  rec.source       = source        # e.g. http://rsssf.org/tabled/duit89.html -- use source_url - why?? why not??
  rec.year         = year          ## note: in 2021/22 - year is always end_year, that is, 2022
  rec.title        = title
  rec.authors      = authors
  rec.last_updated = last_updated
  rec.line_count   = @txt.lines.count   ### or @txt.each_line.count
  rec.char_count   = @txt.size          ## note - size/length is true char count (@txt.bytesize is byte count!!)
  rec.sections     = sections

  rec
end ## method build_stat
|
|
279
175
|
|
|
280
176
|
|
|
281
177
|
## Write the page text (as is) to the given path via write_text.
def save( path )
  write_text( path, @txt )
end ## method save
|
|
286
180
|
|
|
287
181
|
end ## class Page
|
|
288
182
|
end ## module Rsssf
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
## add (shortcut) alias
|
|
292
|
-
RsssfPageStat = Rsssf::PageStat
|
|
293
|
-
RsssfPage = Rsssf::Page
|
|
294
|
-
|
|
295
|
-
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
##
|
|
4
|
+
## todo/check - find a better name
|
|
5
|
+
## rename to parse_sections/leagues/??? - why? why not?
|
|
6
|
+
## Parse the schedule configs (in csv format) from txt.
##   A non-empty seasons column gets replaced (in place) by the
##   result of Season.parse_line; all other columns stay as is.
##
##  returns the rows from parse_csv
def parse_schedules( txt )
  rows = parse_csv( txt )

  rows.each do |rec|
    seasons = rec['seasons']
    next if !seasons || seasons.empty?    ## keep missing/empty seasons as is

    rec['seasons'] = Season.parse_line( seasons )
  end

  rows
end
|
|
16
|
+
## Read the text file at path and parse it with parse_schedules.
def read_schedules( path )
  parse_schedules( read_text( path ) )
end
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
__END__
|
|
22
|
+
|
|
23
|
+
############
|
|
24
|
+
## sample usage
|
|
25
|
+
|
|
26
|
+
configs = parse_schedules( <<TXT )
|
|
27
|
+
|
|
28
|
+
header, seasons, basename, title
|
|
29
|
+
Bundesliga, 2010/11..2025/26, 1-bundesliga, Austria | Bundesliga {season}
|
|
30
|
+
ÖFB Cup, 2010/11..2025/26, cup, Austria | ÖFB Cup {season}
|
|
31
|
+
|
|
32
|
+
TXT
|
|
33
|
+
|
|
34
|
+
## pp configs
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
|
|
2
|
+
module Rsssf
class Prep    ## todo: find a better name e.g. BatchPrep or ??


  ## matches (inline) links to other pages
  ##   in the format ‹TITLE, see page PAGEREF›
  ##
  ##        see page 2006f
  ##        see page ../tablesw/worldcup›
  ##  e.g. ‹League C, see page 2023uefanl§lgc›
  ##       ‹League A, see page 2023uefanl.html#lga›
  ##  todo/fix - fix upstream ?? (e.g. remove .html and replace #=>§)
  LINK_APAGE_RE = %r{ ‹(?<title> [^›]+?)
                       , [ ] see [ ] page [ ]
                        (?<pageref> [^›]+?)
                      ›
                    }ix

=begin
["1973/74", "oost74"],
["1975/76", "oost76"],
["list of final tables", "oosthist"],
["list of champions", "oostchamp"],
["list of cup finals", "oostcuphist"],
["list of super cup finals", "oostsupcuphist"],
["list of foundation dates", "oostfound"]]
=end

  ## Expand a page reference (as captured by LINK_APAGE_RE).
  ##
  ##   pageref  - e.g. oost74, 2023uefanl.html#lga or ../tablesw/worldcup
  ##   dirname  - the "local" directory used for relative pagerefs
  ##
  ##  returns   "<dirname>/<pageref>" for a plain (relative) pageref or
  ##            the pageref without the leading ../ for a ../ pageref
  ##  raises    ArgumentError for all other (unsupported) pagerefs
  ##              (./, /, http(s): or anything else)
  def expand_pageref( pageref, dirname: )

    ##
    ## note - pre-process
    ##    2023uefanl.html#lga
    ##    stkitts2025.html#pres
    ##
    ##  remove .html
    ##    and convert the optional #anchor to §anchor
    ##
    ##  fix - upstream - why? why not?

    ## note: strip leading/trailing whitespace (incl. newlines) first,
    ##       otherwise a stray space makes a valid pageref "unsupported"
    pageref = pageref.strip
    pageref = pageref.sub( %r{ \.html\b }ix, '' )
    ## check - only really one # allowed in url path???
    pageref = pageref.sub( '#', '§' )


    ## note: use \A and \z (NOT ^ and $) - the whole pageref must match;
    ##       with line anchors a multi-line pageref passes if
    ##       just one of its lines matches
    if /\A[a-z0-9][a-z0-9§-]*\z/.match?( pageref )
      ## assume relative page in "local" dir
      "#{dirname}/#{pageref}"
    elsif pageref.start_with?( '../' )
      ##  ../tablesw/worldcup
      pageref.sub( "../", '' )
    elsif pageref.start_with?( './' )
      raise ArgumentError, "found (unsupported) ./ pageref >#{pageref}<"
    elsif pageref.start_with?( '/' )
      raise ArgumentError, "found (unsupported) / pageref >#{pageref}<"
    elsif pageref.start_with?( %r{^https?:}i )
      raise ArgumentError, "found (unsupported) https?: pageref >#{pageref}<"
    else
      raise ArgumentError, "found (unsupported) pageref >#{pageref}<"
    end
  end


  ## Collect all page links (see LINK_APAGE_RE) found in txt.
  ##
  ##   txt       - the page text to scan
  ##   basename  - not used (for now); kept for the callers
  ##   dirname   - passed on to expand_pageref
  ##
  ##  returns an array of [title, expanded pageref] pairs
  ##  raises  ArgumentError (from expand_pageref) on an unsupported pageref
  def collect_links( txt, basename:, dirname: )
    txt.scan( LINK_APAGE_RE ).map do |title, pageref|
      [title, expand_pageref( pageref, dirname: dirname )]
    end
  end


end   ## class Prep
end   ## module Rsssf
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
|
|
2
|
+
module Rsssf
class Prep    ## todo: find a better name e.g. BatchPrep or ??


  ###
  # note - check for special cases (later) without an about this document section!!
  #
  ##   https://rsssf.org/tablesb/braz98.html
  ##    has no about this document section
  #      and only a last update: 22 Apr 1999 line  (no author)


  ## matches the html page title e.g. <TITLE>Brazil 1988</TITLE>
  ##   (case-insensitive; the title text may span more than one line)
  TITLE_RE = %r{
                 <TITLE>(?<text>.*?)</TITLE>
               }ixm

  ## Find the page title in the html.
  ##
  ##  returns the title text with html entities converted
  ##    (via PageConverter.convert_html_entities) and
  ##    whitespace cleaned up or nil if no title found
  def find_title( html )
    m = TITLE_RE.match( html )
    return nil unless m

    ## note - convert html entities
    ##   e.g. Brazil 2000 - Copa João Havelange
    text = PageConverter.convert_html_entities( m[:text].strip )

    ##  add autofix known typos/erratas here!!!
    ##  note - title quick typo fix (in brazil) remove <
    ##   e.g. <TITLE>Brazil 1988<</TITLE>
    text = text.gsub( '<', '' )

    ## cleanup whitespace; squish-style (same as for authors)
    ##   note: strip AFTER the typo fix, otherwise "Brazil 1988 <"
    ##         ends up with a trailing space
    text.gsub( /\s+/, ' ' ).strip
  end



  ## matches the author(s) line(s) followed by the last updated line
  ##   (see the samples at the end of the file)
  ABOUT_META_RE = %r{
                    ## (i) author(s) info
                     \b authors? [ ]* :
                          \s+
                      (?<author> .+?)      ## note - non-greedy (may incl. newline break!!)
                          \s+
                    ## (ii) followed by date
                     \b last [ ]+ updated [ ]*:
                         \s*
                      (?<date> \d{1,2} [ ]+     ## day
                               [a-z]{3,10} [ ]+ ## month
                               \d{4} \b)        ## year
                  }ixm



  ## change name to authors_n_updated or such - why? why not?
  ##
  ## Find the author(s) and the last updated date in txt.
  ##
  ##  returns [authors, updated] (both strings with whitespace
  ##    cleaned up) or [nil,nil] if not found
  def find_author_n_date( txt )
    ##
    ## fix/todo: move authors n last updated
    ##            whitespace cleanup - why? why not??

    if m=ABOUT_META_RE.match( txt )

      authors = m[:author].strip.gsub(/\s+/, ' ' )   # cleanup whitespace; squish-style
      authors = authors.gsub( /[ ]*,[ ]*/, ', ' )    # prettify commas - always single space after comma (no space before)

      updated = m[:date].strip.gsub(/\s+/, ' ' )

      [authors, updated]
    else
      ## report error or raise exception??
      ##   return nil for now
      [nil,nil]   ## or return (single) nil ??
    end
  end


end   ## class Prep
end   ## module Rsssf
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
=begin
|
|
86
|
+
e.g.
|
|
87
|
+
|
|
88
|
+
Authors: Hans Schöggl, Jan Schoenmakers and Karel Stokkermans
|
|
89
|
+
|
|
90
|
+
Last updated: 7 Mar 2023
|
|
91
|
+
|
|
92
|
+
-or-
|
|
93
|
+
|
|
94
|
+
Authors: Ambrosius Kutschera
|
|
95
|
+
and Karel Stokkermans
|
|
96
|
+
Last updated: 31 Oct 2004
|
|
97
|
+
|
|
98
|
+
-or-
|
|
99
|
+
|
|
100
|
+
Author: RSSSF
|
|
101
|
+
|
|
102
|
+
Last updated: 15 Jun 2022
|
|
103
|
+
|
|
104
|
+
-or-
|
|
105
|
+
|
|
106
|
+
Authors: Andreas Exenberger, Hans Schöggl
|
|
107
|
+
and Karel Stokkermans
|
|
108
|
+
|
|
109
|
+
Last updated: 15 Jul 2022
|
|
110
|
+
|
|
111
|
+
=end
|