rsssf 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/{HISTORY.md → CHANGELOG.md} +4 -0
- data/Manifest.txt +41 -7
- data/README.md +93 -71
- data/Rakefile +8 -7
- data/config/groups_en.txt +44 -0
- data/config/rounds_en.txt +283 -0
- data/config/rounds_es.txt +20 -0
- data/config/rounds_misc.txt +7 -0
- data/lib/_cocos_.rb +158 -0
- data/lib/rsssf/convert/convert.rb +71 -0
- data/lib/rsssf/convert/errata.rb +103 -0
- data/lib/rsssf/convert/html_entities.rb +150 -0
- data/lib/rsssf/convert/html_to_txt/beautify_anchors.rb +96 -0
- data/lib/rsssf/convert/html_to_txt/make_heading.rb +70 -0
- data/lib/rsssf/convert/html_to_txt/remove_emails.rb +43 -0
- data/lib/rsssf/convert/html_to_txt/replace_a_href.rb +85 -0
- data/lib/rsssf/convert/html_to_txt/replace_a_name.rb +87 -0
- data/lib/rsssf/convert/html_to_txt/replace_heading.rb +76 -0
- data/lib/rsssf/convert/html_to_txt/replace_hr.rb +25 -0
- data/lib/rsssf/convert/html_to_txt.rb +247 -0
- data/lib/rsssf/download.rb +20 -0
- data/lib/rsssf/fmtfix/dates.rb +541 -0
- data/lib/rsssf/fmtfix/dates_helpers.rb +63 -0
- data/lib/rsssf/fmtfix/errata.rb +44 -0
- data/lib/rsssf/fmtfix/fmtfix-base.rb +68 -0
- data/lib/rsssf/fmtfix/fmtfix.rb +101 -0
- data/lib/rsssf/fmtfix/goals.rb +173 -0
- data/lib/rsssf/fmtfix/headers.rb +326 -0
- data/lib/rsssf/fmtfix/outline.rb +228 -0
- data/lib/rsssf/fmtfix/patch_headings.rb +141 -0
- data/lib/rsssf/fmtfix/rounds.rb +74 -0
- data/lib/rsssf/fmtfix/score.rb +92 -0
- data/lib/rsssf/fmtfix/tables.rb +316 -0
- data/lib/rsssf/fmtfix/topscorers.rb +50 -0
- data/lib/rsssf/page-find_schedule.rb +127 -0
- data/lib/rsssf/page-meta.rb +68 -0
- data/lib/rsssf/page.rb +125 -238
- data/lib/rsssf/parse_schedules.rb +34 -0
- data/lib/rsssf/prepare/convert-links.rb +77 -0
- data/lib/rsssf/prepare/convert-meta.rb +111 -0
- data/lib/rsssf/prepare/convert-navlines.rb +154 -0
- data/lib/rsssf/prepare/convert-postproc.rb +141 -0
- data/lib/rsssf/prepare/convert.rb +100 -0
- data/lib/rsssf/prepare/download.rb +40 -0
- data/lib/rsssf/project.rb +154 -0
- data/lib/rsssf/reports/page.rb +66 -23
- data/lib/rsssf/reports/schedule.rb +89 -40
- data/lib/rsssf/schedule.rb +4 -14
- data/lib/rsssf/utils.rb +37 -45
- data/lib/rsssf/version.rb +7 -6
- data/lib/rsssf.rb +82 -19
- metadata +68 -26
- data/.gemtest +0 -0
- data/lib/rsssf/fetch.rb +0 -80
- data/lib/rsssf/html2txt.rb +0 -157
- data/lib/rsssf/patch.rb +0 -28
- data/lib/rsssf/repo.rb +0 -220
- data/test/helper.rb +0 -12
- data/test/test_utils.rb +0 -83
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
module Rsssf
class PageConverter

  ## Matches a plain email address wrapped in parentheses,
  ##   together with all whitespace (incl. newlines) directly in front of it
  ##   e.g. " (some_name@example.com.br)"
  ##
  ##  note - the local part must start with a letter and may only hold
  ##           letters, digits and underscores;
  ##         the domain is made up of letter-only labels
  EMAIL_RE = %r{ \s*
                 \(
                   [a-z][a-z0-9_]+
                   @[a-z]+(\.[a-z]+)+
                 \)
               }imx


  ## Strip email addresses from the page text.
  ##
  ##  two passes:
  ##   1) drop the "blinded" mailto placeholders, that is, (‹mailto›)
  ##        incl. all leading whitespace
  ##   2) drop "plain" emails in parentheses (see EMAIL_RE);
  ##        every removal gets reported on stdout
  ##
  ##  returns the text with the emails removed
  def remove_emails( html )
    without_placeholders = html.gsub( /\s*
                                        \(‹mailto›\)
                                     /xm, '' )

    without_placeholders.gsub( EMAIL_RE ) do |email|
      puts "removing email >#{email}<"
      ''
    end
  end

end # class PageConverter
end # module Rsssf
|
|
43
|
+
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
module Rsssf
class PageConverter

  ## Matches an anchor with an (optional) href attribute incl. its content
  ##  e.g.
  ##    <a href="#sa">Série A</a>
  ##    <A href="http://www.rsssf.org/">Rec.Sport.Soccer
  ##        Statistics Foundation</A>
  ##    <A
  ##       href="mailto:someone@example.com">someone@example.com</A>
  ##    <a>Primer Descenso – First Relegation</a>     (no href at all)
  ##
  ##  note - \s+ before HREF allows a newline between <A and href
  ##  note - the title uses a non-greedy match (.+?) to allow tags inside
  A_HREF_RE = %r{<A
                  (?:
                    \s+ HREF [ ]* =
                    (?<href>[^>]+?)
                  )?
                 >
                 (?<title>.+?)
                 <\/A>
               }imx


  ## Replace all (a href) anchors with a text-only ‹..› marker.
  ##
  ##   no href             =>  ‹title›
  ##   mailto: href        =>  ‹mailto›           (email gets dropped)
  ##   in-page (#) href    =>  ‹title, see §name›
  ##   http(s): href       =>  ‹title›
  ##   any other href      =>  ‹title, see page href›  (trailing .htm|.html stripped)
  ##                             except for titles starting with Rec.Sport.Soccer
  ##
  ##  note - url schemes are matched case-insensitive
  ##           (e.g. MAILTO: or HTTP:) - otherwise the email / url
  ##           ends up in the text as a "see page" reference
  ##  note - uses the squish helper (defined elsewhere in this class)
  ##           to collapse whitespace (incl. newlines) inside the title
  ##
  ##  every replacement gets reported on stdout;
  ##  returns the converted text
  def replace_a_href( html )
    html.gsub( A_HREF_RE ) do |_|
      m = Regexp.last_match   ## note: "save" match first; next regex call resets it

      href  = if m['href']
                m['href'].gsub( /["']/, '' ).strip    ## remove quotes ("" or '')
              else
                nil
              end
      title = m['title'].strip

      if href.nil?
        ## report - <a>hello</a> is useless
        puts " replace anchor w/ missing (!!) href (a) >#{title}<"
        "‹#{squish(title)}›"
      elsif href.match?( /\Amailto:/i )
        puts " blank mailto - anchor (a) href >#{href}, >#{title}<"
        '‹mailto›'     ## delete/remove email
      else
        puts " replace anchor (a) href >#{href}, >#{title}<"

        ## convert href to xref
        xref = if href.start_with?( '#' )             ## in-page ref
                 ", see §#{href[1..-1]}"
               elsif href.match?( /\Ahttps?:/i )      ## external page ref
                 ''    ## skip - keep empty
               elsif title.start_with?( 'Rec.Sport.Soccer' )
                 ''    ## hack - custom exclude; skip - keep empty
               else
                 ## strip (ending) .htm|html
                 ", see page #{href.sub( /\.html?$/,'')}"
               end

        "‹#{squish(title)}#{xref}›"
      end
    end
  end

end # class PageConverter
end # module Rsssf
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
module Rsssf
class PageConverter

  ## named anchors as found in the pages e.g.
  ##
  ##   <a name="sa">Série A</a>
  ##   <a name="sd">Série D</a>
  ##
  ##   <A name=about>
  ##   <H2>About this document</H2></A>
  ##
  ##   <h4><a name="cb">Copa do Brasil</a></h4>


  ## Matches a named anchor incl. its content and closing tag
  ##   note - the content (title) uses a non-greedy match
  ##            to allow tags inside the content too
  A_NAME_OLD_RE = %r{<A [ ]+ NAME [ ]* =
                      (?<name>[^>]+?)
                     >
                     (?<title>.+?)
                     </A>
                   }imx

  ## Matches the opening tag of a named anchor only
  ##   (no content, no closing tag required)
  A_NAME_RE = %r{<A [ ]+ NAME [ ]* =
                  (?<name>[^>]+?)
                 >
               }imx


  ## Replace named anchors incl. content and closing tag, that is,
  ##     <a name="sa">Série A</a>   =>   Série A  ‹§sa›
  ##
  ##  note - MUST use A_NAME_OLD_RE (with the title capture);
  ##           A_NAME_RE has no title group and m[:title] raises an IndexError
  ##  note - assumes text-only titles for now
  ##
  ##  every replacement gets reported on stdout;
  ##  returns the converted text
  def replace_a_name_old( html )
    html.gsub( A_NAME_OLD_RE ) do |match|
      m = Regexp.last_match   ## note: "save" match first; next regex call resets it

      name  = m[:name].gsub( /["']/, '' ).strip    ## remove quotes ("" or '')
      title = m[:title].strip
      debug = match.gsub( "\n", '$$' )             ## make newlines visible for debugging
      puts " replace anchor (a) name >#{name}<, >#{title}< - >#{debug}<"

      "#{title}  ‹§#{name}›"    ## note - use two spaces min (between title & name)
    end
  end


  ## Replace the opening tag of named anchors with a ‹§name› marker, that is,
  ##     <a name="semi"><H2>Semifinals</H2>   =>   ‹§semi›<H2>Semifinals</H2>
  ##
  ##  note - allows <a name=""> without closing </a>;
  ##           a closing </a> (if present) is left untouched
  ##
  ##  every replacement gets reported on stdout;
  ##  returns the converted text
  def replace_a_name( html )
    html.gsub( A_NAME_RE ) do |match|
      m = Regexp.last_match   ## note: "save" match first; next regex call resets it

      name  = m[:name].gsub( /["']/, '' ).strip    ## remove quotes ("" or '')
      debug = match.gsub( "\n", '$$' )             ## make newlines visible for debugging
      puts " replace anchor (a) name >#{name}< - >#{debug}<"

      "‹§#{name}›"
    end
  end

end # class PageConverter
end # module Rsssf
|
|
86
|
+
|
|
87
|
+
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
module Rsssf
class PageConverter

  ## Matches headings h1 to h6 incl. content
  ##   note - the closing tag uses a backref(erence) to the level
  ##   note - includes leading and trailing spaces (incl. newlines) !!!
  ##   note - the content (title) uses a non-greedy match
  ##            to allow tags inside the content too
  HEADING_RE = %r{ \s*
                   <H(?<level>[1-6])>
                     (?<title> .+?)
                   </H\k<level>>
                   \s*
                 }imx


  ## Matches the "pseudo" headings <hb>..</hb> and <hu>..</hu>
  ##   note - MUST be on a single line of its own
  ##           (leading and trailing spaces allowed)
  BOLD_OR_UNDERLINE_LINE_HEADING_RE = %r{^
                                          [ ]*
                                          <H (?<tag> [BU]) >
                                            (?<title> .+?)
                                          </H \k<tag> >
                                          [ ]*
                                         $
                                        }ix


  ## Replace html headings with text headings (= marker per level).
  ##
  ##   <h2>Final</h2>    =>   == Final          (blank line before and after)
  ##   <hb>Group A</hb>  =>   ===== Group A     (level 5; no newlines added)
  ##   <hu>Round 1</hu>  =>   ====== Round 1    (level 6; no newlines added)
  ##
  ##  every replacement gets reported on stdout;
  ##  returns the converted text
  def replace_heading( html )
    with_headings = html.gsub( HEADING_RE ) do
      found = Regexp.last_match
      depth = found[:level].to_i(10)
      text  = found[:title]

      puts " replace heading #{depth} (h#{depth}) >#{text}<"

      ## note: always two newlines before and after
      "\n\n#{'='*depth} #{text}\n\n"
    end

    with_headings.gsub( BOLD_OR_UNDERLINE_LINE_HEADING_RE ) do
      found = Regexp.last_match
      kind  = found[:tag].downcase
      text  = found[:title]

      ## heading 5 for b(old) and heading 6 for u(nderline) for now
      depth = case kind
              when 'b' then 5
              when 'u' then 6
              else
                raise ArgumentError, "b(old)|u(underscore) tag expected; got #{kind}"
              end

      puts " replace #{kind}-heading #{depth} (h#{depth}) >#{text}<"

      ## note: no newlines before and after
      "#{'='*depth} #{text}"
    end
  end

end # class PageConverter
end # module Rsssf
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
module Rsssf
class PageConverter

  ## text replacement for a horizontal rule (hr)
  ##   note - incl. a blank line before and after
  HR_LINE_ASCII = "\n\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n\n"


  ## Replace all (attribute-less) <hr> tags
  ##   incl. all surrounding whitespace (incl. newlines)
  ##   with the HR_LINE_ASCII text line.
  ##
  ##  every replacement gets reported on stdout;
  ##  returns the converted text
  def replace_hr( html )
    html.gsub( /\s*<HR>\s*/im ) do |rule|
      visible = rule.gsub( "\n", '$$' )     ## make newlines visible for debugging
      puts " replace horizontal rule (hr) - >#{visible}<"
      HR_LINE_ASCII
    end
  end

end # class PageConverter
end # module Rsssf
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
module Rsssf
class PageConverter

  ## Squish all runs of whitespace (space, tab, newline, carriage return)
  ##   into a single space.
  ##  note - leading and trailing whitespace is NOT removed (only collapsed)
  def squish( str )
    str.gsub( /[ \r\t\n]+/, ' ' )
  end


  ## Convert a html page into plain text.
  ##
  ##   html - the page source
  ##   url  - only used for reporting tags left unprocessed
  ##
  ##  returns a pair, that is, [txt, edits]
  ##     edits holds whatever make_heading reports
  ##
  ##  note - the order of the steps matters e.g.
  ##           - headings may include anchors (thus, anchors go first)
  ##           - bold may include anchors (thus, bold goes after anchors)
  ##  note - errata_html, make_heading, replace_hr, replace_a_name,
  ##         replace_a_href, replace_heading, remove_emails,
  ##         beautify_anchors, errata_txt and log
  ##           are defined elsewhere in this class
  def html_to_txt( html, url: )
    ## record / track (important) edits - sub(stitutions) etc.
    edits = []

    ## cut off everything before (and incl.) body
    ##   note - might incl. attributes e.g. <body bgcolor="yellow">
    html = html.sub( /.+?
                      <BODY [^>]*? >
                      \s*
                     /xim, '' )

    ## special case i) no <body> - cut off everything
    ##    before (and incl.) </head> if present
    html = html.sub( /.+?
                      <\/HEAD>
                      \s*
                     /xim, '' )

    ## special case ii) no <body>, no </head> - cut off everything
    ##    before (and incl.) <head/> if present
    html = html.sub( /.+?
                      <HEAD\/>
                      \s*
                     /xim, '' )

    ## cut off everything after (and incl.) closing body
    html = html.sub( /<\/BODY>.*/im, '' )
    ## special case - cut off everything after (and incl.) closing html
    html = html.sub( /<\/HTML>.*/im, '' )

    ## quick fix - remove (first single-line) title and (first) meta
    ##   left over if the page has no body / head to cut off
    html = html.sub( /<TITLE>.*?<\/TITLE>/i, '' )
    html = html.sub( /<META .*?>/i, '' )

    ## (auto-)fix known typos / errors
    html = errata_html( html )

    ## mark bold/underline lines as (pseudo) headings
    html, more_edits = make_heading( html )
    edits += more_edits

    ## remove cite (keep content)
    html = html.gsub( /<CITE>([^<]+)<\/CITE>/im ) do |_|
      inner = Regexp.last_match( 1 )
      puts " remove cite >#{inner}<"
      inner
    end

    html = replace_hr( html )

    ## replace break (br)
    ##   note: include (swallow) "extra" newline
    html = html.gsub( /<BR>\s*/i ) do |match|
      puts " replace break (br) - >#{match.gsub( "\n", '$$' )}<"
      "\n"
    end

    html = replace_a_name( html )
    html = replace_a_href( html )

    ## quickfix - remove trailing </a> left possibly by a_name
    html = html.gsub( /<\/A>/i, '' )

    ## replace paragraph (p)
    ##   note: include (swallow) surrounding whitespace
    html = html.gsub( /\s*<P>\s*/im ) do |match|
      puts " replace paragraph (p) - >#{match.gsub( "\n", '$$' )}<"
      "\n\n"
    end
    html = html.gsub( /<\/P>/i, '' )    ## closing p - replace w/ nothing for now

    html = replace_heading( html )

    ## remove inline formatting (keep content)
    ##   note - order matters: i, u, b, strong
    ##   note - b might include anchors (thus, goes after anchors)
    [[ /<I>(.*?)<\/I>/im,           'italic (i)'      ],
     [ /<U>(.*?)<\/U>/im,           'underline (u)'   ],
     [ /<B>(.*?)<\/B>/im,           'bold (b)'        ],
     [ /<STRONG>(.*?)<\/STRONG>/im, 'strong (strong)' ],
    ].each do |tag_re, label|
      html = html.gsub( tag_re ) do |_|
        inner = Regexp.last_match( 1 )
        puts " remove #{label} >#{inner}<"
        inner
      end
    end

    ## replace preformatted (pre) tags with comments
    ##   (keeps surrounding newlines)
    html = html.gsub( /<PRE>|<\/PRE>/i ) do |match|
      puts " replace preformatted (pre)"
      match.downcase == '<pre>' ? '<!-- start pre -->' : '<!-- end pre -->'
    end

    html = remove_emails( html )
    html = beautify_anchors( html )

    ## check for html tags left
    ##   note - report only (text is NOT changed); thus, scan and not gsub
    html.scan( /<
                 \/?
                 [A-Z]+ [^>]*
                >
               /xim ) do |tag|
      next if ['<menu>',  '<ul>',  '<li>',
               '</menu>', '</ul>', '</li>'].include?( tag.downcase )

      msg = "found unprocessed html tag #{tag} in >#{url}<"
      puts "*** WARN - #{msg}"
      log( msg )    ## log too (see log.txt)
    end

    ## cleanup whitespaces (line-by-line)
    txt = String.new
    html.each_line do |line|
      line = line.gsub( "\t", '  ' )  # replace all tabs w/ two spaces for now
      line = line.rstrip              # remove trailing whitespace (incl. newline/formfeed)

      txt << line
      txt << "\n"
    end

    txt = errata_txt( txt )

    [txt, edits]
  end # method html_to_txt

end # class PageConverter
end # module Rsssf
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
|
|
2
|
+
module Rsssf

  ## Get the html page for the url via Webget
  ##   and return the page text in the encoding passed in.
  ##
  ##  note - the encoding is a required keyword argument;
  ##           (per the original note) rsssf pages are assumed to use
  ##           ISO_8859_15 or plain 7-bit ascii and NOT utf-8 - TODO confirm
  ##  note - Webget presumably fetches via HTTP GET and caches the page
  ##           - verify against the webget docs
  ##  note - exits the process (with status 1) if the response is not ok;
  ##           does NOT continue or raise for now
  ##
  ##  prints the first 401 characters of the page (for debugging)
  def self.download_page( url, encoding: )
    response = Webget.page( url, encoding: encoding )

    if response.status.nok?    ## e.g. HTTP status code != 200
      exit 1
    end

    puts "html:"
    page_text = response.text( encoding: encoding )
    pp page_text[0..400]

    page_text
  end

end # module Rsssf
|