rsssf 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. checksums.yaml +5 -5
  2. data/{HISTORY.md → CHANGELOG.md} +4 -0
  3. data/Manifest.txt +41 -7
  4. data/README.md +93 -71
  5. data/Rakefile +8 -7
  6. data/config/groups_en.txt +44 -0
  7. data/config/rounds_en.txt +283 -0
  8. data/config/rounds_es.txt +20 -0
  9. data/config/rounds_misc.txt +7 -0
  10. data/lib/_cocos_.rb +158 -0
  11. data/lib/rsssf/convert/convert.rb +71 -0
  12. data/lib/rsssf/convert/errata.rb +103 -0
  13. data/lib/rsssf/convert/html_entities.rb +150 -0
  14. data/lib/rsssf/convert/html_to_txt/beautify_anchors.rb +96 -0
  15. data/lib/rsssf/convert/html_to_txt/make_heading.rb +70 -0
  16. data/lib/rsssf/convert/html_to_txt/remove_emails.rb +43 -0
  17. data/lib/rsssf/convert/html_to_txt/replace_a_href.rb +85 -0
  18. data/lib/rsssf/convert/html_to_txt/replace_a_name.rb +87 -0
  19. data/lib/rsssf/convert/html_to_txt/replace_heading.rb +76 -0
  20. data/lib/rsssf/convert/html_to_txt/replace_hr.rb +25 -0
  21. data/lib/rsssf/convert/html_to_txt.rb +247 -0
  22. data/lib/rsssf/download.rb +20 -0
  23. data/lib/rsssf/fmtfix/dates.rb +541 -0
  24. data/lib/rsssf/fmtfix/dates_helpers.rb +63 -0
  25. data/lib/rsssf/fmtfix/errata.rb +44 -0
  26. data/lib/rsssf/fmtfix/fmtfix-base.rb +68 -0
  27. data/lib/rsssf/fmtfix/fmtfix.rb +101 -0
  28. data/lib/rsssf/fmtfix/goals.rb +173 -0
  29. data/lib/rsssf/fmtfix/headers.rb +326 -0
  30. data/lib/rsssf/fmtfix/outline.rb +228 -0
  31. data/lib/rsssf/fmtfix/patch_headings.rb +141 -0
  32. data/lib/rsssf/fmtfix/rounds.rb +74 -0
  33. data/lib/rsssf/fmtfix/score.rb +92 -0
  34. data/lib/rsssf/fmtfix/tables.rb +316 -0
  35. data/lib/rsssf/fmtfix/topscorers.rb +50 -0
  36. data/lib/rsssf/page-find_schedule.rb +127 -0
  37. data/lib/rsssf/page-meta.rb +68 -0
  38. data/lib/rsssf/page.rb +125 -238
  39. data/lib/rsssf/parse_schedules.rb +34 -0
  40. data/lib/rsssf/prepare/convert-links.rb +77 -0
  41. data/lib/rsssf/prepare/convert-meta.rb +111 -0
  42. data/lib/rsssf/prepare/convert-navlines.rb +154 -0
  43. data/lib/rsssf/prepare/convert-postproc.rb +141 -0
  44. data/lib/rsssf/prepare/convert.rb +100 -0
  45. data/lib/rsssf/prepare/download.rb +40 -0
  46. data/lib/rsssf/project.rb +154 -0
  47. data/lib/rsssf/reports/page.rb +66 -23
  48. data/lib/rsssf/reports/schedule.rb +89 -40
  49. data/lib/rsssf/schedule.rb +4 -14
  50. data/lib/rsssf/utils.rb +37 -45
  51. data/lib/rsssf/version.rb +7 -6
  52. data/lib/rsssf.rb +82 -19
  53. metadata +68 -26
  54. data/.gemtest +0 -0
  55. data/lib/rsssf/fetch.rb +0 -80
  56. data/lib/rsssf/html2txt.rb +0 -157
  57. data/lib/rsssf/patch.rb +0 -28
  58. data/lib/rsssf/repo.rb +0 -220
  59. data/test/helper.rb +0 -12
  60. data/test/test_utils.rb +0 -83
@@ -0,0 +1,43 @@
1
module Rsssf
class PageConverter

  ## matches a parenthesized email address incl. all leading whitespace
  ##   (incl. newlines) e.g.
  ##     " (joe@example.com)"  or  "\n  (first.last@mail-1.example.com.br)"
  ##
  ## note - the local part may incl. dots, dashes, plus signs, etc.
  ##        and the domain labels may incl. digits and dashes;
  ##        a stricter pattern lets those addresses slip through into the text
  EMAIL_RE = %r{ \s*
                 \(
                   [a-z0-9][a-z0-9._%+-]*
                   @
                   [a-z0-9-]+ (?: \. [a-z0-9-]+ )+
                 \)
               }ix


  ## remove all (parenthesized) emails from the passed in text
  ##
  ##  html - text with anchors already converted (see replace_a_href)
  ##
  ##  returns a new string; the passed in string is NOT changed
  def remove_emails( html )
    ### remove converted ("blinded") mailto anchors
    ##   note usually inside () e.g.
    ##      (‹mailto›)
    ##   plus slurp up all leading whitespace (incl. newline)
    html = html.gsub( / \s*
                        \(‹mailto›\)
                      /x, '' )

    ###
    ## remove "regular" emails too e.g.
    ##
    ##  Thanks to Marcelo Leme de Arruda (___@___.__.br),
    ##            Ricardo FF Pontes (___@____.com),
    ##            Santiago Reis (____@____.com.br),
    ##            Marcos Lacerda Queiroz (___@____.com.br)
    ##  etc.
    ##
    ## note - only emails wrapped in () get removed for now
    html.gsub( EMAIL_RE ) do |match|
      puts "removing email >#{match}<"
      ''
    end
  end

end # class PageConverter
end # module Rsssf
43
+
@@ -0,0 +1,85 @@
1
module Rsssf
class PageConverter

  ## anchors (a href) to convert e.g.
  ##
  ##   <a href="#sa">Série A</a><br>
  ##
  ##   <A href="http://www.rsssf.org/">Rec.Sport.Soccer
  ##      Statistics Foundation</A>
  ##   <A href="http://www.rsssfbrasil.com">RSSSF
  ##      Brazil</A>
  ##
  ##   and Daniel Dalence (<A
  ##     href="mailto:danielballack@terra.com.br">danielballack@terra.com.br</A>)
  ##
  ##  or "empty" (that is, without any href)
  ##   <a>Primer Descenso – First Relegation</a>
  ##
  ## note: <a \newline href is used for authors email
  ##          - thus, a newline counts as space before href
  ## note: the title uses a non-greedy match (.+?) and may span lines
  A_HREF_RE = %r{<A
                   (?:
                     \s+ HREF [ ]* =
                     (?<href>[^>]+?)
                   )?
                  >
                  (?<title>.+?)
                 </A>
                }imx


  ## replace all anchors (a href) with a plain-text marker
  ##
  ##   <a>Title</a>                  =>  ‹Title›
  ##   <a href="mailto:..">..</a>    =>  ‹mailto›   (email gets removed)
  ##   <a href="#sa">Title</a>       =>  ‹Title, see §sa›
  ##   <a href="http://..">Title</a> =>  ‹Title›
  ##   <a href="page.html">Title</a> =>  ‹Title, see page page›
  ##
  ## note: heading 4 includes anchor (thus, let anchors go first)
  ##
  ## returns a new string
  def replace_a_href( html )
    html.gsub( A_HREF_RE ) do
      ## note: "save" captures first; last match gets replaced by next regex call
      m     = Regexp.last_match
      href  = m[:href]&.delete( %q{"'} )&.strip    ## remove quotes ("" or '')
      title = m[:title].strip

      if href.nil?
        ## report error - <a>hello</a> is useless
        puts " replace anchor w/ missing (!!) href (a) >#{title}<"
        "‹#{squish(title)}›"
      elsif href.match?( /\Amailto:/i )
        ## note: check scheme case-insensitive (e.g. MAILTO:) - otherwise
        ##   the email would end-up in the text as "see page MAILTO:..."
        puts " blank mailto - anchor (a) href >#{href}, >#{title}<"
        '‹mailto›'     ## delete/remove email
      else
        puts " replace anchor (a) href >#{href}, >#{title}<"
        "‹#{squish(title)}#{a_href_xref( href, title )}›"
      end
    end
  end


  ## convert href to the xref suffix appended to the anchor title
  ##   (returns empty string if no xref wanted)
  def a_href_xref( href, title )
    if href.start_with?( '#' )                     ## in-page ref
      ", see §#{href[1..-1]}"
    elsif href.match?( /\Ahttps?:/i )              ## external page ref
      ''      ## skip - keep empty - why? why not? (or add url domain?)
    elsif title.start_with?( 'Rec.Sport.Soccer' )  ## hack - custom exclude
      ''      ## skip - keep empty
    else
      ## strip (ending) .htm|html
      ", see page #{href.sub( /\.html?$/, '' )}"
    end
  end
  private :a_href_xref

end # class PageConverter
end # module Rsssf
@@ -0,0 +1,87 @@
1
module Rsssf
class PageConverter

  ## named anchors (a name) to convert e.g.
  ##
  ##   <a name="sa">Série A</a>
  ##   <a name="sd">Série D</a>
  ##
  ##   <A name=about>
  ##   <H2>About this document</H2></A>
  ##     => change to (possible?)
  ##   <H2><A name=about>About this document</A></H2>
  ##
  ##   <h4><a name="cb">Copa do Brasil</a></h4>

  ## "old" style - opening tag, content (title) and closing tag
  ##   note - for content use non-greedy to allow
  ##            match of tags inside content too
  A_NAME_OLD_RE = %r{<A [ ]+ NAME [ ]* =
                       (?<name>[^>]+?)
                      >
                      (?<title>.+?)
                     </A>
                    }imx

  ## opening tag only (no title capture!)
  A_NAME_RE = %r{<A [ ]+ NAME [ ]* =
                   (?<name>[^>]+?)
                  >
                }imx


  ## replace named anchors incl. content and closing tag e.g.
  ##
  ##   <a name="sa">Série A</a>   =>  Série A  ‹§sa›
  ##
  ## returns a new string
  def replace_a_name_old( html )
    ## note: MUST use the regex with the title capture;
    ##   asking the opening-tag-only regex (A_NAME_RE) for m[:title]
    ##   raises an IndexError (undefined group name reference)
    html.gsub( A_NAME_OLD_RE ) do |match|
      ## note: "save" captures first; last match gets replaced by next regex call
      m     = Regexp.last_match
      name  = m[:name].gsub( /["']/, '' ).strip     ## remove ("" or '')
      title = m[:title].strip
      match = match.gsub( "\n", '$$' )   ## make newlines visible for debugging
      puts " replace anchor (a) name >#{name}<, >#{title}< - >#{match}<"

      ##
      ## todo - report WARN if title incl. tags
      ##     assumes text only for now - why? why not?
      ##      add a name inside heading !!!
      ##      do NOT add heading inside a name !!!

      "#{title}  ‹§#{name}›"   ## note - use two spaces min (between title & name)
    end
  end


  ## replace the opening tag of named anchors e.g.
  ##
  ##   <a name="semi"><H2>Semifinals</H2>  =>  ‹§semi›<H2>Semifinals</H2>
  ##
  ## note - allows <a name=""> without closing </a>;
  ##          a closing </a> (if present) is NOT removed here
  ##
  ## returns a new string
  def replace_a_name( html )
    html.gsub( A_NAME_RE ) do |match|
      m     = Regexp.last_match
      name  = m[:name].gsub( /["']/, '' ).strip     ## remove ("" or '')
      match = match.gsub( "\n", '$$' )   ## make newlines visible for debugging
      puts " replace anchor (a) name >#{name}< - >#{match}<"

      "‹§#{name}›"
    end
  end

end # class PageConverter
end # module Rsssf
86
+
87
+
@@ -0,0 +1,76 @@
1
module Rsssf
class PageConverter

  ## headings h1,h2,h3,h4,h5,h6
  ##   the closing tag must repeat the level of the opening tag
  ##    (checked via a backreference)
  ##
  ##   note - all leading and trailing whitespace (incl. newlines)
  ##           is part of the match !!!
  ##   note - title is non-greedy to allow tags inside the title too
  HEADING_RE = %r{ \s*
                   <H(?<level>[1-6])>
                     (?<title> .+?)
                   </H\k<level>>
                   \s*
                 }imx

  ## "pseudo" headings <hb>..</hb> and <hu>..</hu>
  ##   note - MUST be on a single line of its own (see make heading for more)
  BOLD_OR_UNDERLINE_LINE_HEADING_RE = %r{^
                                          [ ]*
                                          <H (?<tag> [BU]) >
                                            (?<title> .+?)
                                          </H \k<tag> >
                                          [ ]*
                                          $
                                        }ix


  ## convert headings to "=" markers e.g.
  ##
  ##   <h2>Série A</h2>   =>  blank line, "== Série A", blank line
  ##   <hb>Round 1</hb>   =>  "===== Round 1"    (level 5, no newlines added)
  ##   <hu>Final</hu>     =>  "====== Final"     (level 6, no newlines added)
  ##
  ## returns a new string
  def replace_heading( html )
    converted = html.gsub( HEADING_RE ) do
      hit   = Regexp.last_match
      depth = Integer( hit[:level], 10 )
      text  = hit[:title]

      puts " replace heading #{depth} (h#{depth}) >#{text}<"

      ## always surround with two newlines
      "\n\n#{'=' * depth} #{text}\n\n"
    end

    converted.gsub( BOLD_OR_UNDERLINE_LINE_HEADING_RE ) do
      hit  = Regexp.last_match
      kind = hit[:tag].downcase
      text = hit[:title]

      ## bold maps to heading 5 and underline to heading 6 for now
      depth = case kind
              when 'b' then 5
              when 'u' then 6
              else
                raise ArgumentError, "b(old)|u(underscore) tag expected; got #{kind}"
              end

      puts " replace #{kind}-heading #{depth} (h#{depth}) >#{text}<"

      ## keep the line as is - that is, no newlines added
      "#{'=' * depth} #{text}"
    end
  end

end # class PageConverter
end # module Rsssf
@@ -0,0 +1,25 @@
1
module Rsssf
class PageConverter

  ## plain-text stand-in for a horizontal rule (always on a line of its own)
  ##   check what hr to use - . - . - or =-=-=-= or something distinct?
  HR_LINE_ASCII = "\n\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n\n"

  ## horizontal rule incl. all surrounding whitespace (incl. newlines)
  ##   note - allows attributes and the self-closing style too e.g.
  ##            <hr>, <HR SIZE=2>, <hr />
  ##          the word boundary (\b) keeps other tags starting with hr out
  HR_RE = %r{ \s* <HR \b [^>]* > \s* }ix


  ## replace all horizontal rules (hr) with HR_LINE_ASCII
  ##
  ## returns a new string
  def replace_hr( html )
    html.gsub( HR_RE ) do |match|
      match = match.gsub( "\n", '$$' )   ## make newlines visible for debugging
      puts " replace horizontal rule (hr) - >#{match}<"
      HR_LINE_ASCII
    end
  end

end # class PageConverter
end # module Rsssf
22
+
23
+
24
+
25
+
@@ -0,0 +1,247 @@
1
module Rsssf
class PageConverter

  ## (list) tags that may stay in the text - no warning reported
  LEFTOVER_TAGS_OK = ['<menu>',  '<ul>',  '<li>',
                      '</menu>', '</ul>', '</li>'].freeze


  ## squish more than one white space (incl. newlines) to one space
  def squish( str )
    str.gsub( /[ \r\t\n]+/, ' ' )
  end


  ## convert a html page to plain text
  ##
  ##   html - page source (string)
  ##   url: - page url; used for warnings about unprocessed tags only
  ##
  ## returns a pair (array), that is, the text and
  ##   the edits recorded / tracked by make_heading
  ##
  ## note - the order of the steps matters e.g.
  ##          anchors go before headings and bold
  ##          (headings and bold might incl. anchors)
  def html_to_txt( html, url: )
    ## record / track (important) edits - sub(stitutions) etc.
    edits = []

    ## cut off everything before body
    ##   note - might incl. attributes e.g.
    ##       <body bgcolor="yellow">
    ##   note - anchor at the start (\A) and allow nothing (.*?) before
    ##            the tag - otherwise a page starting with <body> keeps its tag
    html = html.sub( /\A .*?
                      <BODY [^>]*? >
                      \s*
                     /xim, '' )

    ## special case i) no <body> - cut off everything up to </head> if present
    ##   used in braz93.html, braz98.html
    html = html.sub( /\A .*?
                      <\/HEAD>
                      \s*
                     /xim, '' )

    ## special case ii) no <body>, no </head>
    ##   cut off everything up to <head/>
    html = html.sub( /\A .*?
                      <HEAD\/>
                      \s*
                     /xim, '' )

    ## cut off everything after body (closing)
    html = html.sub( /<\/BODY>.*/im, '' )

    ## special case - cut off everything after </html> (closing)
    ##   used in braz93.html, braz98.html
    html = html.sub( /<\/HTML>.*/im, '' )

    ## quick fix - remove title and meta (if left over) e.g.
    ##  <title>World Cup 1950 qualifications</title>
    ##  <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-2">
    ##  note - use m/multiline; tags might span more than one line
    html = html.sub( /<TITLE>.*?<\/TITLE>/im, '' )
    html = html.sub( /<META .*?>/im, '' )

    ## (auto-)fix known typos / errors
    ##   todo - pass in/along filename/url too - why? why not?
    html = errata_html( html )

    ## change ^<b><a name ...></a></b>$ to <hb> - heading "bold" - might be h5
    ##        ^<u><a name ...></a></u>$ to <hu> - heading "underscore" - might be h6
    html, more_edits = make_heading( html )
    edits += more_edits

    ## remove cite
    html = html.gsub( /<CITE>([^<]+)<\/CITE>/im ) do
      cite = Regexp.last_match( 1 )
      puts " remove cite >#{cite}<"
      cite
    end

    html = replace_hr( html )

    ## replace break (br)
    html = html.gsub( /<BR>\s*/i ) do |match|    ## note: include (swallow) "extra" newline
      match = match.gsub( "\n", '$$' )   ## make newlines visible for debugging
      puts " replace break (br) - >#{match}<"
      "\n"
    end

    html = replace_a_name( html )
    html = replace_a_href( html )

    ## quickfix remove trailing </a> left possibly by a_name
    html = html.gsub( /<\/A>/i, '' )

    ## replace paragraph (p)
    html = html.gsub( /\s*<P>\s*/im ) do |match|    ## note: include (swallow) "extra" newline
      match = match.gsub( "\n", '$$' )   ## make newlines visible for debugging
      puts " replace paragraph (p) - >#{match}<"
      "\n\n"
    end
    html = html.gsub( /<\/P>/i, '' )    ## replace paragraph (p) closing w/ nothing for now

    html = replace_heading( html )

    ## remove inline styles - keep the content
    ##   note: b(old) might include anchors (thus, call after anchors)
    html = strip_inline_tag( html, 'I',      'italic (i)' )
    html = strip_inline_tag( html, 'U',      'underline (u)' )
    html = strip_inline_tag( html, 'B',      'bold (b)' )
    html = strip_inline_tag( html, 'STRONG', 'strong (strong)' )

    ## replace preformatted (pre) with (start/end) comments
    html = html.gsub( /<PRE>|<\/PRE>/i ) do |match|
      puts " replace preformatted (pre)"
      match.downcase == '<pre>' ? '<!-- start pre -->' : '<!-- end pre -->'
    end

    ## for debugging:
    ##   puts "html:"
    ##   puts html[0..2000]
    ##   puts "-- snip --"
    ##   puts html[-1000..-1]

    html = remove_emails( html )
    html = beautify_anchors( html )

    ## check for html tags left (report only - text is NOT changed)
    html.scan( /<
                 \/?
                 [A-Z]+ [^>]*
                >
               /xim ) do |tag|
      next if LEFTOVER_TAGS_OK.include?( tag.downcase )

      msg = "found unprocessed html tag #{tag} in >#{url}<"
      puts "*** WARN - #{msg}"
      log( msg )    ## log too (see log.txt)
    end

    ##
    ## todo/fix
    ##   move up-front - kind of preprocessing (not post) - why? why not?

    ## cleanup whitespaces
    ##   todo/fix:  convert newline in space first
    ##                and than collapse spaces etc.!!!
    txt = String.new
    html.each_line do |line|
      line = line.gsub( "\t", '  ' )  # replace all tabs w/ two spaces for now
      line = line.rstrip              # remove trailing whitespace (incl. newline/formfeed)

      txt << line
      txt << "\n"
    end

    txt = errata_txt( txt )

    [txt, edits]
  end # method html_to_txt


  ## remove opening and closing tag (without attributes) - keep the content
  ##   note - content is non-greedy and might span more than one line
  def strip_inline_tag( html, tag, label )
    html.gsub( %r{<#{tag}>(.*?)</#{tag}>}im ) do
      inner = Regexp.last_match( 1 )
      puts " remove #{label} >#{inner}<"
      inner
    end
  end
  private :strip_inline_tag

end # class PageConverter
end # module Rsssf
@@ -0,0 +1,20 @@
1
+
2
module Rsssf

  ## fetch (and cache) a html page (via HTTP GET) and return the page text
  ##
  ##   url       - page url
  ##   encoding: - character encoding; passed along as is to Webget
  ##                 note: rsssf pages are assumed to NOT use utf-8
  ##                       (e.g. ISO_8859_15, the updated version of ISO_8859_1)
  ##
  ## note: exits (with status 1) on get / fetch error
  ##         - does NOT continue for now - why? why not?
  def self.download_page( url, encoding: )
    page = Webget.page( url, encoding: encoding )

    if page.status.nok?    ## e.g. HTTP status code != 200
      exit 1
    end

    puts "html:"
    page.text( encoding: encoding ).tap do |body|
      pp body[0..400]      ## print the first 400+ chars for debugging
    end
  end
end # module Rsssf