rsssf 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +2 -0
- data/Manifest.txt +39 -2
- data/README.md +67 -62
- data/Rakefile +2 -2
- data/config/groups_en.txt +44 -0
- data/config/rounds_en.txt +283 -0
- data/config/rounds_es.txt +20 -0
- data/config/rounds_misc.txt +7 -0
- data/lib/_cocos_.rb +158 -0
- data/lib/rsssf/convert/convert.rb +71 -0
- data/lib/rsssf/convert/errata.rb +103 -0
- data/lib/rsssf/convert/html_entities.rb +150 -0
- data/lib/rsssf/convert/html_to_txt/beautify_anchors.rb +96 -0
- data/lib/rsssf/convert/html_to_txt/make_heading.rb +70 -0
- data/lib/rsssf/convert/html_to_txt/remove_emails.rb +43 -0
- data/lib/rsssf/convert/html_to_txt/replace_a_href.rb +85 -0
- data/lib/rsssf/convert/html_to_txt/replace_a_name.rb +87 -0
- data/lib/rsssf/convert/html_to_txt/replace_heading.rb +76 -0
- data/lib/rsssf/convert/html_to_txt/replace_hr.rb +25 -0
- data/lib/rsssf/convert/html_to_txt.rb +247 -0
- data/lib/rsssf/download.rb +4 -135
- data/lib/rsssf/fmtfix/dates.rb +541 -0
- data/lib/rsssf/fmtfix/dates_helpers.rb +63 -0
- data/lib/rsssf/fmtfix/errata.rb +44 -0
- data/lib/rsssf/fmtfix/fmtfix-base.rb +68 -0
- data/lib/rsssf/fmtfix/fmtfix.rb +101 -0
- data/lib/rsssf/fmtfix/goals.rb +173 -0
- data/lib/rsssf/fmtfix/headers.rb +326 -0
- data/lib/rsssf/fmtfix/outline.rb +228 -0
- data/lib/rsssf/fmtfix/patch_headings.rb +141 -0
- data/lib/rsssf/fmtfix/rounds.rb +74 -0
- data/lib/rsssf/fmtfix/score.rb +92 -0
- data/lib/rsssf/fmtfix/tables.rb +316 -0
- data/lib/rsssf/fmtfix/topscorers.rb +50 -0
- data/lib/rsssf/page-find_schedule.rb +127 -0
- data/lib/rsssf/page-meta.rb +68 -0
- data/lib/rsssf/page.rb +89 -227
- data/lib/rsssf/parse_schedules.rb +34 -0
- data/lib/rsssf/prepare/convert-links.rb +77 -0
- data/lib/rsssf/prepare/convert-meta.rb +111 -0
- data/lib/rsssf/prepare/convert-navlines.rb +154 -0
- data/lib/rsssf/prepare/convert-postproc.rb +141 -0
- data/lib/rsssf/prepare/convert.rb +100 -0
- data/lib/rsssf/prepare/download.rb +40 -0
- data/lib/rsssf/project.rb +154 -0
- data/lib/rsssf/reports/page.rb +40 -8
- data/lib/rsssf/reports/schedule.rb +18 -55
- data/lib/rsssf/utils.rb +28 -17
- data/lib/rsssf/version.rb +5 -2
- data/lib/rsssf.rb +53 -13
- metadata +50 -9
- data/lib/rsssf/convert.rb +0 -495
- data/lib/rsssf/repo.rb +0 -144
data/lib/_cocos_.rb
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
###
|
|
2
|
+
## move "upstream" to cocos for sharing
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
##
|
|
6
|
+
## note - use File.file? instead of File.exist?
|
|
7
|
+
## (checks if file exists AND file is a file NOT a directory)
|
|
8
|
+
##
|
|
9
|
+
## todo/fix - add an option to check if file found or not
|
|
10
|
+
## return nil if not found or such
|
|
11
|
+
##
|
|
12
|
+
## use find_file! and find_file or such - why? why not?
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
=begin
|
|
16
|
+
def find_file(name, path: [])
|
|
17
|
+
path.each do |dir|
|
|
18
|
+
full = File.join(dir, name)
|
|
19
|
+
return full if File.exist?(full)
|
|
20
|
+
end
|
|
21
|
+
nil
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
def find_file!(name, path: [])
|
|
25
|
+
find_file(name, path:) or
|
|
26
|
+
raise Errno::ENOENT, ""
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
plus add option - raise_on_error: false - why? why not?
|
|
30
|
+
def find_file! - find_file( raise_on_error: false )
|
|
31
|
+
|
|
32
|
+
=end
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
##
## strict variant of find_file
##
##  name - file name (or path) handed on to find_file
##  path - list of directories handed on to find_file
##
##  returns the path reported by find_file
##  raises  Errno::ENOENT if find_file returns nil
def find_file!( name, path: )
  filepath = find_file( name, path: path )

  ## note - the exception class is Errno::ENOENT
  ##          (the former spelling Errorno raised a NameError instead)
  raise Errno::ENOENT, "file <#{name}> not found; looking in path #{path.inspect}"  if filepath.nil?

  filepath
end
|
|
42
|
+
|
|
43
|
+
##
|
|
44
|
+
## note - find_file will NOT find directories!!!
|
|
45
|
+
## File.file? will only check if a file (not directory) exists!!
|
|
46
|
+
|
|
47
|
+
##
## look up a file - first name as-is, then name joined onto
##   every directory in path (in order)
##
##  returns the first candidate for which File.file? is true
##          or nil if there is none (directories never count)
def find_file( name, path: )
  return name  if File.file?( name )

  hit_dir = path.find { |dir| File.file?( File.join( dir, name )) }
  hit_dir && File.join( hit_dir, name )
end
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
####
|
|
62
|
+
# parse/find_patterns
|
|
63
|
+
|
|
64
|
+
## use/rename to VARDEF_LINE or such - why? why not?
|
|
65
|
+
## a line holding nothing but a variable definition e.g.
##    $key = value
VARDEF_RE = %r{\A
                 [ ]*
                   \$(?<key> [a-z][a-z0-9_]*)
                 [ ]*
                   =
                 [ ]*
                   (?<value> .+?)   ## eat-up (non-greedy) the rest until end-of-line
                 [ ]*
               \z
              }ix

## a variable reference (anywhere in a line) e.g. $key
VAR_RE = %r{ \$(?<key> [a-z][a-z0-9_]*)
               \b
           }ix


## read the text at path (via read_text) and hand it to parse_patterns
def read_patterns( path )
  txt = read_text( path )
  parse_patterns( txt )
end

##
## turn a patterns text into an array of pattern lines
##
##  - \r\n becomes \n; a backslash (plus optional spaces) at end-of-line
##      glues the next line on
##  - blank lines and lines starting with # are skipped;
##      a line reading __END__ stops the parse
##  - inline comments (from the first # on) are cut off
##  - "$key = value" lines define variables (keys are downcased);
##      $key references in later lines get replaced
##      (raises ArgumentError if the key is not defined)
##  - runs of spaces are squished, unescaped ( becomes (?: and the
##      space shortcut chars (· □ ▫ ␣) become " [ ] "
def parse_patterns( txt )
  txt = txt.gsub( /\r\n/, "\n" )
           .gsub( /\\[ ]*$\n/, '' )

  defs     = {}
  patterns = []    # array of lines (with words)

  txt.each_line do |raw|
    stripped = raw.strip

    next   if stripped.empty? || stripped.start_with?( '#' )
    break  if stripped == '__END__'

    ## e.g.  Janvier Janv Jan  ## check janv in use??
    ##   =>  Janvier Janv Jan
    body = stripped.sub( /#.*/, '' ).strip

    vardef = VARDEF_RE.match( body )
    if vardef
      defs[ vardef[:key].downcase ] = vardef[:value]
      next
    end

    body = body.gsub( VAR_RE ) do
      key = Regexp.last_match[:key].downcase
      defs.fetch( key ) { raise ArgumentError, "subvars - no vardef found for key >#{key}<" }
    end

    body = body.gsub( /[ ]{2,}/, ' ' )

    ## grouping parens become non-capture groups;
    ##    escaped parens e.g. playoffs \(liguilla\) stay as is
    body = body.gsub( /   ## negative lookbehind
                        (?<! \\)
                          \(
                      /x, '(?: ')

    body = body.gsub( /[·□▫␣]/, ' [ ] ' )

    patterns << body
  end

  patterns
end
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
|
|
2
|
+
module Rsssf
|
|
3
|
+
class PageConverter
|
|
4
|
+
|
|
5
|
+
## convenience helper
|
|
6
|
+
## convenience helper - convert html via a "shared" built-in converter
##
##  the converter is created on first use and kept in a class-level
##   instance variable (a @@class variable would be shared by
##   every subclass too)
def self.convert( html, url: )
  @converter ||= new
  @converter.convert( html, url: url )
end
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
##
|
|
15
|
+
## add anchor: options or such
|
|
16
|
+
## lets you toggle adding anchors (§premier etc.) - why? why not?
|
|
17
|
+
|
|
18
|
+
##
## convert an html page to text
##
##   1) normalize newlines and tabs
##   2) convert_html_entities
##   3) "smart" quotes and the en dash to plain ascii
##   4) html_to_txt (its result is returned)
##
## todo/fix: first check if html is all ascii-7bit
def convert( html, url: )
  ## windows-style \r\n (carriage return + line feed) becomes plain \n
  ## NOTE(review): an older comment says tabs become two spaces;
  ##    the literal below is a single space - confirm
  text = html.gsub( "\r\n", "\n" )
             .gsub( "\t", ' ' )

  text = convert_html_entities( text, url: url )

  ## e.g.  D´Alessandro  => D'Alessandro
  ##       81´ and 88'   => 81' and 88'
  text = text.gsub( /[´’‘]/, "'" )
             .gsub( /[“”]/, '"' )
             .gsub( '–', '-' )

  html_to_txt( text, url: url )
end  ## method convert
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
###################
|
|
57
|
+
# more helpers
|
|
58
|
+
## append msg plus a newline to ./logs.txt (opened in append mode as utf-8)
def self.log( msg )
  File.open( './logs.txt', 'a:utf-8' ) { |io| io.write( msg, "\n" ) }
end

## instance-level shortcut for the class-level log
def log( msg )
  self.class.log( msg )
end
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
end # module PageConverter
|
|
71
|
+
end # module Rsssf
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
module Rsssf
|
|
2
|
+
class PageConverter
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
##
|
|
6
|
+
## todo/fix/fix/fix
|
|
7
|
+
## add filenames/urls for quick fixes!!!
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
##
## auto-fix known typos / errors in the raw html
##   (kind of PRE-processing, see errata_txt for POST-processing)
##
##  html - the page source
##  returns the patched html
def self.errata_html( html )
  ## rm </ADDRESS>  - tablesb/braz94.html
  html = html.gsub( '</ADDRESS>', '' )

  ## </a => </a>  e.g.
  ##   <a href="#play6">Gold League (Calle 6)</a
  ##   <a href="#zpl">PBZ Premier League 2025/26</a
  ##
  ##  note - only a bare </a gets closed; longer tag names
  ##    (e.g. the lower-case address closing tag) are left alone
  html = html.gsub( /<\/A
                      (?! [a-z0-9] )   ## NOT the start of a longer tag name
                      (?! [ ]*> )      ## NOT closed already
                   /ix, '</A>' )

  ## </br> => <br>
  html = html.gsub( /<\/BR>/i, '<BR>' )

  ## typo <H1></H2>  - tables/58full.html
  html = html.gsub( '<H1>Quarterfinals</H2>', '<H2>Quarterfinals</H2>' )

  ## typo <M>,<N> for <B>  - tables/54full.html, tables/58full.html
  html = html.gsub( '<M>MEX</B>', '<B>MEX</B>' )
  html = html.gsub( '<N>CZE</B>', '<B>CZE</B>' )

  ## tablesb/braz88.html
  html = html.gsub( '<</TITLE>', '</TITLE>' )

  ## hr (horizontal rule) via img  - in tables/30full.html and others e.g.
  ##   <IMG SRC="xshadow.gif.pagespeed.ic.AbdeNVcmzw.png" ALT="-----------">
  html = html.gsub( /<IMG
                       [^>]+?
                       ALT="-{3,}"
                     >/ixm, '<HR>' )

  html
end

## instance-level shortcut for the class-level errata_html
def errata_html( html )
  self.class.errata_html( html )
end
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
##
## auto-fix misspelled html entities
##
##  note - every search string is the (misspelled) entity text itself;
##    the ü line matches the entity text &uml; (NOT the decoded
##    diaeresis character)
##
##  returns html with the known typos replaced by the intended char
def self.errata_html_entities( html )
  html = html.gsub( "&oulm;",    'ö' )   ## typo for &ouml;
  html = html.gsub( "&uml;",     'ü' )   ## typo for &uuml; - why? why not?
  html = html.gsub( "&slig;",    'ß' )   ## typo for &szlig;
  html = html.gsub( "&aaacute;", 'á' )   ## typo for &aacute;
  html = html.gsub( "&nitlde;",  'ñ' )   ## typo for &ntilde;
  html
end

## instance-level shortcut for the class-level errata_html_entities
def errata_html_entities( html )
  self.class.errata_html_entities( html )
end
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
##
## auto-fix known typos / errors in the converted text
##   (kind-of POST-processing, see errata_html for PRE-processing)
##
##  returns the patched txt
def errata_txt( txt )
  ## squish spaces (to single)  - tables/82full.html
  ##   note - matches two or more spaces between the words
  ##     (an identical search and replacement string changes nothing)
  txt = txt.gsub( /Second[ ]{2,}phase/, 'Second phase' )

  ## add (missing) closing bracket (])  - tables/70q.html
  txt = txt.gsub( /^South America Group 10 \[Brazil$/,
                   'South America Group 10 [Brazil]' )

  txt
end
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
end # module PageConverter
|
|
102
|
+
end # module Rsssf
|
|
103
|
+
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
|
|
2
|
+
module Rsssf
|
|
3
|
+
class PageConverter
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
ENTITIES = %w[
|
|
8
|
+
À À
|
|
9
|
+
Á Á
|
|
10
|
+
 Â
|
|
11
|
+
à Ã
|
|
12
|
+
Ä Ä
|
|
13
|
+
Å Å
|
|
14
|
+
|
|
15
|
+
à à
|
|
16
|
+
á á
|
|
17
|
+
â â
|
|
18
|
+
ã ã
|
|
19
|
+
ä ä
|
|
20
|
+
å å
|
|
21
|
+
Æ Æ
|
|
22
|
+
æ æ
|
|
23
|
+
ß ß
|
|
24
|
+
Ç Ç
|
|
25
|
+
ç ç
|
|
26
|
+
È È
|
|
27
|
+
É É
|
|
28
|
+
Ê Ê
|
|
29
|
+
Ë Ë
|
|
30
|
+
è è
|
|
31
|
+
é é
|
|
32
|
+
ê ê
|
|
33
|
+
ë ë
|
|
34
|
+
|
|
35
|
+
ð ð
|
|
36
|
+
|
|
37
|
+
Ì Ì
|
|
38
|
+
Í Í
|
|
39
|
+
Î Î
|
|
40
|
+
Ï Ï
|
|
41
|
+
ì ì
|
|
42
|
+
í í
|
|
43
|
+
î î
|
|
44
|
+
ï ï
|
|
45
|
+
Ñ Ñ
|
|
46
|
+
ñ ñ
|
|
47
|
+
Ò Ò
|
|
48
|
+
Ó Ó
|
|
49
|
+
Ô Ô
|
|
50
|
+
Õ Õ
|
|
51
|
+
Ö Ö
|
|
52
|
+
ò ò
|
|
53
|
+
ó ó
|
|
54
|
+
ô ô
|
|
55
|
+
õ õ
|
|
56
|
+
ö ö
|
|
57
|
+
Ø Ø
|
|
58
|
+
ø ø
|
|
59
|
+
Ù Ù
|
|
60
|
+
Ú Ú
|
|
61
|
+
Û Û
|
|
62
|
+
Ü Ü
|
|
63
|
+
ù ù
|
|
64
|
+
ú ú
|
|
65
|
+
û û
|
|
66
|
+
ü ü
|
|
67
|
+
Ý Ý
|
|
68
|
+
ý ý
|
|
69
|
+
ÿ ÿ
|
|
70
|
+
|
|
71
|
+
< <
|
|
72
|
+
> >
|
|
73
|
+
& &
|
|
74
|
+
© ©
|
|
75
|
+
® ®
|
|
76
|
+
|
|
77
|
+
]
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
##
## replace html entities by (unicode) chars
##
##  html - the page source
##  url  - optional; only added to the warning about leftover entities
##
##  steps (in order):
##   1) the german umlaut / sharp s entities
##   2) errata_html_entities
##   3) every pair in ENTITIES
##   4) decimal entities (&#nnn;) map 1:1 to the unicode code point,
##        except &#307; (ij ligature) which becomes the two letters ij
##   5) anything left that looks like &...; (max. 10 chars inside)
##        is reported via puts and log and passed through as is
##
##  returns the converted html
def self.convert_html_entities( html, url: nil )
  ## note - the search strings are the entity texts (NOT the decoded
  ##   chars; replacing a char with itself changes nothing)
  html = html.gsub( "&auml;",  'ä' )
  html = html.gsub( "&ouml;",  'ö' )
  html = html.gsub( "&uuml;",  'ü' )
  html = html.gsub( "&Auml;",  'Ä' )
  html = html.gsub( "&Ouml;",  'Ö' )
  html = html.gsub( "&Uuml;",  'Ü' )
  html = html.gsub( "&szlig;", 'ß' )

  html = errata_html_entities( html )

  ENTITIES.each_slice(2) do |str, entity|
    html = html.gsub( entity, str )
  end

  ## check for decimal entities (mapping 1:1 to unicode)
  html = html.gsub( /&#(\d+);/ ) do |match|
    if match == '&#307;'     ## use like Van D&#307;k -> Van Dijk
      'ij'
    else
      [Regexp.last_match(1).to_i].pack( 'U' )
    end
  end

  ## check for more entities
  html = html.gsub( /&[^; ]{1,10};/ ) do |match|
    ## ignore weird edge cases e.g.
    ##   [M&A; moved from pool B]   - where M&A is name of club
    ##   in ital03.html:
    ##     [Eugenio Corini 22pen&36pen; Christian Vieri 69]
    ##     Vincenzo Montella 49&68; Antonio Di Natale 11]
    unless ['&A;', '&36pen;', '&68;'].include?( match )
      msg = "found unencoded html entity #{match}"
      msg += " in >#{url}<"   if url

      puts "*** WARN - #{msg}"
      log( msg )    ## log too (see log.txt)
    end

    match   ## pass through as is (1:1)
  end

  html
end

## instance-level shortcut for the class-level convert_html_entities
def convert_html_entities( html, url: nil )
  self.class.convert_html_entities( html, url: url )
end
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
end # module PageConverter
|
|
149
|
+
end # module Rsssf
|
|
150
|
+
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
module Rsssf
|
|
2
|
+
class PageConverter
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
##
## move anchors (a name) in the form ‹§name› to the end of "their" line
##
##  three passes (every rewrite gets reported via puts):
##
##   1) anchor followed by a heading - merge into one line
##          ‹§2fin›
##
##          == Semifinals
##        =>
##          == Semifinals ‹§2fin›
##
##   2) anchor starting a line with text
##          ‹§eng›ENGLAND
##        =>
##          ENGLAND ‹§eng›
##
##   3) anchor inside a heading (right after the marker)
##          ==== ‹§gra›Group A
##        =>
##          ==== Group A ‹§gra›
##
##  returns the reformatted text
def beautify_anchors( html )
  html = html.gsub( /\s*
                      (?<name>‹§
                                [^›]+?
                              ›)
                      \s*
                      (?<heading>={2,}
                                   [^=\n]+?
                      )
                      \n
                    \s*/ixm ) do |hit|
    anchor  = Regexp.last_match( :name )
    heading = Regexp.last_match( :heading )

    shown = hit.gsub( "\n", '$$' )   ## make newlines visible for debugging
    puts " mergeing anchor (a name) with heading into one line - >#{shown}<"

    "\n\n#{heading} #{anchor}\n\n"
  end

  html = html.gsub( /\n
                      (?<name>‹§
                                [^›]+?
                              ›)
                      [ ]*
                      (?<text>[^\n]+?
                      )
                      \n
                    /ixm ) do |hit|
    anchor = Regexp.last_match( :name )
    text   = Regexp.last_match( :text )

    shown = hit.gsub( "\n", '$$' )
    puts " move anchor (a name) starting line with text to end - >#{shown}<"

    "\n#{text} #{anchor}\n"
  end

  html.gsub( /\n
               (?<heading_marker>
                     ={2,})
               [ ]*
               (?<name>‹§
                         [^›]+?
                       ›)
               [ ]*
               (?<heading_text>[^\n]+?
               )
               \n
             /ixm ) do |hit|
    marker = Regexp.last_match( :heading_marker )
    anchor = Regexp.last_match( :name )
    text   = Regexp.last_match( :heading_text )

    shown = hit.gsub( "\n", '$$' )
    puts " move anchor (a name) in heading to end - >#{shown}<"

    "\n#{marker} #{text} #{anchor}\n"
  end
end
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
end # module PageConverter
|
|
95
|
+
end # module Rsssf
|
|
96
|
+
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
###
|
|
4
|
+
## <b><a name="fall">Opening Season 2024</a></b> => <hb> ... </hb>
|
|
5
|
+
## <u><a name="fplay">Playoff Stage</a></u> => <hu> ... </hu>
|
|
6
|
+
##
|
|
7
|
+
## (inofficial) heading "bold", heading "underscore"
|
|
8
|
+
## note - MUST be one single "stand-alone" line (in pre block) !!!
|
|
9
|
+
|
|
10
|
+
=begin
|
|
11
|
+
BU_ANAME_LINE_RE = %r{^ [ ]* < (?<tag>B|U) >
|
|
12
|
+
[ ]* (?<text>
|
|
13
|
+
<A [ ]+ NAME
|
|
14
|
+
.+?
|
|
15
|
+
</A>
|
|
16
|
+
)
|
|
17
|
+
[ ]* </ \k<tag> >
|
|
18
|
+
[ ]*
|
|
19
|
+
$}ix
|
|
20
|
+
=end
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
## scan for now only (do NOT replace)
|
|
24
|
+
## a "stand-alone" line fully enclosed in <b>..</b> or <u>..</u>
##   (the closing tag must repeat the opening tag)
BOLD_OR_UNDERLINE_LINE_RE = %r{^ [ ]* < (?<tag> [BU]) >
                                 [ ]* (?<text>
                                         .+?    ## note - use non-greedy match
                                      )
                                 [ ]* </ \k<tag> >
                                 [ ]*
                               $}ix


##
## turn bold / underline lines wrapping an a name anchor into
##   (inofficial) headings e.g.
##     <b><a name="fall">Opening Season 2024</a></b>  =>  <hb> ... </hb>
##     <u><a name="fplay">Playoff Stage</a></u>       =>  <hu> ... </hu>
##
##  other bold / underline lines stay untouched but get reported
##   (unless the text mentions copyright)
##
##  returns [html, edits] - edits holds one "-- ..." note per report
def make_heading( html )
  notes = []

  out = html.gsub( BOLD_OR_UNDERLINE_LINE_RE ) do |line|
    tag  = Regexp.last_match( :tag ).downcase
    text = Regexp.last_match( :text )

    if text.downcase.start_with?( '<a name' )
      msg = "make heading (h#{tag}) out of #{tag}-enclosed a name in line >#{text}<"
      puts " #{msg}"
      notes << "-- #{msg}"     ## note - edit line MUST start with --

      next "<h#{tag}>#{text}</h#{tag}>"
    end

    ## skip (false positive) copyright line e.g. (C) Copyright RSSSF
    unless %r{copyright}i.match?( text )
      msg = "found #{tag}-enclosed line >#{text}< - heading?"
      puts " #{msg}"
      notes << "-- #{msg}"
    end

    line   ## keep as is (do NOT change)
  end

  [out, notes]
end
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
module Rsssf
|
|
2
|
+
class PageConverter
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
## an email in parentheses - incl. all leading whitespace e.g.
##     (name@example.com)
EMAIL_RE = %r{ \s*
               \(
                 [a-z][a-z0-9_]+
                 @[a-z]+(\.[a-z]+)+
               \)
             }imx


##
## remove emails in parentheses
##
##   1) converted ("blinded") mailto anchors e.g. (‹mailto›)
##   2) "regular" emails matching EMAIL_RE (every hit gets reported via puts)
##
##  note - leading whitespace (incl. newlines) gets removed too
##
##  returns html without the emails
def remove_emails( html )
  without_mailto = html.gsub( /\s*
                                \(‹mailto›\)
                             /xm, '' )

  without_mailto.gsub( EMAIL_RE ) do |email|
    puts "removing email >#{email}<"
    ''
  end
end
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
end # module PageConverter
|
|
42
|
+
end # module Rsssf
|
|
43
|
+
|