rsssf 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +2 -0
  3. data/Manifest.txt +39 -2
  4. data/README.md +67 -62
  5. data/Rakefile +2 -2
  6. data/config/groups_en.txt +44 -0
  7. data/config/rounds_en.txt +283 -0
  8. data/config/rounds_es.txt +20 -0
  9. data/config/rounds_misc.txt +7 -0
  10. data/lib/_cocos_.rb +158 -0
  11. data/lib/rsssf/convert/convert.rb +71 -0
  12. data/lib/rsssf/convert/errata.rb +103 -0
  13. data/lib/rsssf/convert/html_entities.rb +150 -0
  14. data/lib/rsssf/convert/html_to_txt/beautify_anchors.rb +96 -0
  15. data/lib/rsssf/convert/html_to_txt/make_heading.rb +70 -0
  16. data/lib/rsssf/convert/html_to_txt/remove_emails.rb +43 -0
  17. data/lib/rsssf/convert/html_to_txt/replace_a_href.rb +85 -0
  18. data/lib/rsssf/convert/html_to_txt/replace_a_name.rb +87 -0
  19. data/lib/rsssf/convert/html_to_txt/replace_heading.rb +76 -0
  20. data/lib/rsssf/convert/html_to_txt/replace_hr.rb +25 -0
  21. data/lib/rsssf/convert/html_to_txt.rb +247 -0
  22. data/lib/rsssf/download.rb +4 -135
  23. data/lib/rsssf/fmtfix/dates.rb +541 -0
  24. data/lib/rsssf/fmtfix/dates_helpers.rb +63 -0
  25. data/lib/rsssf/fmtfix/errata.rb +44 -0
  26. data/lib/rsssf/fmtfix/fmtfix-base.rb +68 -0
  27. data/lib/rsssf/fmtfix/fmtfix.rb +101 -0
  28. data/lib/rsssf/fmtfix/goals.rb +173 -0
  29. data/lib/rsssf/fmtfix/headers.rb +326 -0
  30. data/lib/rsssf/fmtfix/outline.rb +228 -0
  31. data/lib/rsssf/fmtfix/patch_headings.rb +141 -0
  32. data/lib/rsssf/fmtfix/rounds.rb +74 -0
  33. data/lib/rsssf/fmtfix/score.rb +92 -0
  34. data/lib/rsssf/fmtfix/tables.rb +316 -0
  35. data/lib/rsssf/fmtfix/topscorers.rb +50 -0
  36. data/lib/rsssf/page-find_schedule.rb +127 -0
  37. data/lib/rsssf/page-meta.rb +68 -0
  38. data/lib/rsssf/page.rb +89 -227
  39. data/lib/rsssf/parse_schedules.rb +34 -0
  40. data/lib/rsssf/prepare/convert-links.rb +77 -0
  41. data/lib/rsssf/prepare/convert-meta.rb +111 -0
  42. data/lib/rsssf/prepare/convert-navlines.rb +154 -0
  43. data/lib/rsssf/prepare/convert-postproc.rb +141 -0
  44. data/lib/rsssf/prepare/convert.rb +100 -0
  45. data/lib/rsssf/prepare/download.rb +40 -0
  46. data/lib/rsssf/project.rb +154 -0
  47. data/lib/rsssf/reports/page.rb +40 -8
  48. data/lib/rsssf/reports/schedule.rb +18 -55
  49. data/lib/rsssf/utils.rb +28 -17
  50. data/lib/rsssf/version.rb +5 -2
  51. data/lib/rsssf.rb +53 -13
  52. metadata +50 -9
  53. data/lib/rsssf/convert.rb +0 -495
  54. data/lib/rsssf/repo.rb +0 -144
@@ -0,0 +1,316 @@
1
+ module Rsssf
2
+ class Fmtfix ## todo: find a better name e.g. Format or Fixer or ??
3
+
4
+
5
+
6
def self.table_heading_( line )
  ## Build the source text of a regex group from a plain column heading.
  ##
  ##   e.g.   M W T L GF GA PTS AVGE
  ##   =>     (?: [ ]+ M [ ]+ W [ ]+ T [ ]+ L [ ]+ GF [ ]+ GA [ ]+ PTS [ ]+ AVGE [ ]*)
  ##
  ## The result is a String (not a Regexp), meant to be interpolated into an
  ## extended-mode (x) pattern. Column labels are inserted as-is (NOT escaped),
  ## so a label such as  PTS?  keeps its regex meaning (optional S).
  ##
  ## line - heading text; leading/trailing whitespace is stripped and
  ##        columns are split on runs of spaces

  labels = line.strip.split( /[ ]+/ )
  joined = labels.join( ' [ ]+ ' )

  '(?: [ ]+ ' + joined + ' [ ]*)'
end
17
+
18
+
19
+
20
+
21
+
22
+ ##
23
+ ### note - may start with blank line OR
24
+ ## header
25
+ ## followed by optional heading (e.g. M W T L GF GA PTS)
26
+ ## and table lines ( 1. rapid 38 17 ...)
27
+
28
+
29
+
30
+ ##
31
+ ## note simple/compact table standing format needs more thinking
32
+ ## will match
33
+ ## FC Schalke 04 1-3 1. FSV Mainz 05
34
+ ## Hannover 96 3-1 1. FC Nürnberg
35
+ ## FC Schalke 04 0-1 1. FC Kaiserslautern
36
+ ## Hannover 96 2-0 1. FSV Mainz 05
37
+ ## FSV Mainz 05 3-1 1. FC Köln
38
+ ##
39
+ ## - add a required ranking in the beginning e.g. 1., 2. or such?
40
+
41
+ =begin
42
+ | (?: ## or compact/min form -- 22 37-15 51
43
+ ## maybe allow spaces later inbetween 37- 15 - why? why not?
44
+ ## 1. 1. FC Köln 30 17 11 2 78- 40 45
45
+
46
+ [ ]+ \d{1,3} ## played
47
+ [ ]+ \d{1,3} [ ]? -[ ]? \d{1,3} ## gf-ga
48
+ [ ]+ \d{1,3} \b ## pts
49
+ )
50
+ =end
51
+
52
+
53
+ =begin
54
+ ## (i) table header
55
+ ##
56
+ ## fix - make header match more strict!!!
57
+ ## e.g. do NOT match --- or more than three spaces or such
58
+ ## exclude in header
59
+ ## NB:
60
+ ## [*]
61
+ ## [1]
62
+ ## exclude heading === e.g.
63
+ ## ==== USL Premier Development
64
+ ^
65
+ [ ]*
66
+
67
+
68
+ ## exclude comma (,) - why? why not?
69
+ ## and numbers - unless group 1
70
+ ## e.g. Kaczor 78 - Dreßel 19, Steinkogler 50,
71
+ ## B'schweig 2-1 Schalke (Handschuh 38, Popivoda 55 - Fischer 82)
72
+ ## M'gladbach 2-1 1. FC Köln (Jensen 6, Wittkamp 35 - D.Müller 78)
73
+ ## Kraft 3, E.Kremers 38)
74
+ ## Schalke 4-0 Tasmania (Klose 2, 78, Herrmann 40, Kreuz 82)
75
+ ##
76
+ ## allow name such as
77
+ ## USL - 1ST DIVISION (2nd Division)
78
+
79
+
80
+ (?<header> [^=*:,0-9\[\]\n]+?
81
+ ([ ] \d{1,2} \b)? ## optional number only at the end e.g. group 1
82
+ )
83
+ :? ## optional colon (:) e.g. final table:
84
+ =end
85
+
86
+
87
## TABLE_HEADER_RE - case-insensitive, extended-mode (ix) pattern for a
##   table header (title) text; the pattern itself carries no ^ / $ anchors.
##
##   named capture `header`:
##     - must contain at least one letter (positive lookahead)
##     - starts with one token: mixed alphanum, number (optional dot,
##         not followed by a digit) or letters (optional dot)
##     - followed by zero or more tokens, each joined by an optional single
##         space, a dash (-) not next to a digit, or a slash (/)
##     - optionally ends with a space plus a parenthesized part
##   an optional colon (:) may follow (outside of the capture)
##
##   note - the ## comments inside the literal are part of the regex source
+ TABLE_HEADER_RE = %r{
88
+ ############
89
+ ## negative & positive lookaheads
90
+
91
+ ## (?!
92
+ ## .* [ ]{2,} ## no (inline) double (or more) spaces allowed
93
+ ## )
94
+
95
+ (?:
96
+ ## (i) can only start with non-zero number
97
+ ## or alpha
98
+ ##
99
+ ## A. or
100
+ ## 1. or
101
+ ## mixed with dot 1A. yes/no?, A1. yes/no?, 1B1. ?
102
+ ## 1.K - 1.Klasse
103
+
104
+ ##
105
+ ## note - \b(oundary) - to always get complete tokens (alphanum) tokens
106
+ ## note - \b includes [a-z0-9_] PLUS underscore (_)
107
+ ## check if underscore is \b
108
+ ## e.g. 09_ or _09 or match \b[0-9]\b ???
109
+ ## use our own asserts?
110
+ ## BNUM (boundary number) e.g. [^0-9]
111
+ ## BALPHA (boundary alpha) e.g. [^a-z]
112
+ ## BALNUM (boundary alphanum) e.g. [^a-z0-9]
113
+ ## classic is [^a-z0-9_]
114
+
115
+ (?<header>
116
+ (?=
117
+ .* \p{L}+ ## must incl. alpha character - not only numbers!!
118
+ )
119
+
120
+ ## note
121
+ ## order matters
122
+ ## move specific first!!
123
+
124
+ \b
125
+ (?: [0-9]+\p{L} [0-9\p{L}]* \b ## (ii) mixed alphanum (starting w/ num)
126
+ | [0-9]+ \b \.? (?! \d) ## (i) num
127
+ | \p{L}+[0-9] [0-9\p{L}]* \b ## (iiii) mixed alphanum (starting w/ alpha)
128
+ | \p{L}+ \b \.? ## (iii) alpha
129
+ )
130
+ (?:
131
+ ## " (i-iiii) connector options (a) single space
132
+ ## -- exclude numbers on numbers (FIX)
133
+ ## (b) dash (-) or slash (/)
134
+ ## -- must be alpha(.?)-alpha
135
+ ## incl. K.-H. with trailing dot
136
+ ## add ampersand (&) too
137
+ ## w/ leading & trailing opt space?
138
+ ## incl. K.&H., K. & H.
139
+ (?: [ ]?
140
+ | (?<! \d) - ## add negative lookbehind&ahead (no numbers please)
141
+ (?! \d)
142
+ | /
143
+ )
144
+ ## repeat (i-iiii) see above
145
+ ## todo - do NOT allow numbers followed by numbers
146
+ \b
147
+ (?: [0-9]+ \b (?! [ ] \d) ## (i) num - no more ordinals - why? why not?
148
+ | [0-9]+\p{L} [0-9\p{L}]* \b ## (ii) mixed alphanum (starting w/ num)
149
+ ## group 1a 1FC?? - why? why not?
150
+ | \p{L}+ \b \.? ## (iii) alpha
151
+ | \p{L}+[0-9] [0-9\p{L}]* \b ## (iiii) mixed alphanum (starting w/ alpha)
152
+ )
153
+ )*
154
+ (?:
155
+ [ ]
156
+ \( [^:()\[\]]+? \)
157
+ )?
158
+ ) ## end-of-capture header
159
+ )
160
+ :? ## optional colon (:) e.g. final table:
161
+ }ix
162
+
163
+
164
## TABLE_RE - case-insensitive, extended-mode, dot-matches-newline (ixm)
##   pattern for one whole (standings) table block. top to bottom:
##
##   1) optional header line - named capture `header`
##        (no = * : , digits [ ] or newline; optional 1-2 digit number at
##         the end; optional colon; optional "inline" table heading after
##         four or more spaces; optional blank line).
##        guarded by a negative lookahead - the header line must NOT look
##        like a standing line, a table heading (GP, M, Team) or a --- rule
##   2) optional table heading line - the alternatives get built by
##        table_heading_( ... ) via interpolation, that is, evaluated when
##        this constant gets assigned (table_heading_ must be defined before)
##   3) required first standing line - runs of 1-3 digit numbers
##        (draw may be joined with a slash; gf/ga with : or - or spaces;
##         last number may carry a +/- sign)
##   4) the rest (lazy) up to the next blank line or the end of string
##
##   note - `header` is nil if the optional header line is missing
##   note - the ## comments inside the literal are part of the regex source
+ TABLE_RE = %r{
165
+
166
+ ### optional table header
167
+ (?:
168
+ ### negative lookahead
169
+ ## MUST NOT match standing line e.g. 10 3 4
170
+ ## or table heading (see below)
171
+ ## or ----- (old style structured heading left overs)
172
+ (?! ^[ ]* (?: [^\n]+? [ ]+ \d{1,3} [ ]+ \d{1,3} [ ]+ \d{1,3}
173
+ | (?: GP | M | Team ) [ ]
174
+ | -{3,}
175
+ )
176
+ )
177
+
178
+ ## (i) table header
179
+ ##
180
+ ## fix - make header match more strict!!!
181
+ ## e.g. do NOT match --- or more than three spaces or such
182
+ ## exlcude in header
183
+ ## NB:
184
+ ## [*]
185
+ ## [1]
186
+ ## exclude heading === e.g.
187
+ ## ==== USL Premier Development
188
+ ^
189
+ [ ]*
190
+
191
+
192
+ ## exclude comma (,) - why? why not?
193
+ ## and numbers - unless group 1
194
+ ## e.g. Kaczor 78 - Dreßel 19, Steinkogler 50,
195
+ ## B'schweig 2-1 Schalke (Handschuh 38, Popivoda 55 - Fischer 82)
196
+ ## M'gladbach 2-1 1. FC Köln (Jensen 6, Wittkamp 35 - D.Müller 78)
197
+ ## Kraft 3, E.Kremers 38)
198
+ ## Schalke 4-0 Tasmania (Klose 2, 78, Herrmann 40, Kreuz 82)
199
+ ##
200
+ ## allow name such as
201
+ ## USL - 1ST DIVISION (2nd Division)
202
+
203
+
204
+ (?<header> [^=*:,0-9\[\]\n]+?
205
+ ([ ] \d{1,2} \b)? ## optional number only at the end e.g. group 1
206
+ )
207
+ :? ## optional colon (:) e.g. final table:
208
+ ## cut-off everything separated by more than three spaces
209
+ ## e.g. might be "inline" table heading (follow table header name)
210
+ ## e.g. Group 1 M W T L GF GA DIF PTS
211
+ (?: [ ]{4,} (?: GP | M |Team ) [ ] [^\n]+? )?
212
+ [ ]*
213
+ ## note - allow optional blank line - why? why not?
214
+ (?: \n ^[ ]* )?
215
+ \n
216
+ )?
217
+
218
+
219
+ #### optional table heading line
220
+ (?: ^(?:
221
+ #{table_heading_( 'GP W L D GF GA PTS?' )}
222
+ | #{table_heading_( 'GP W L T GF GA PTS?' )}
223
+ | #{table_heading_( 'GP W T L GF GA PTS?' )}
224
+ | #{table_heading_( 'GP W D L GF GA PTS?' )}
225
+ ## SW sudden death win, SL sudden death lose
226
+ | #{table_heading_( 'GP W L SW GF GA PTS?' )}
227
+ | #{table_heading_( 'GP W SW SL L GF GA PTS?' )}
228
+ | #{table_heading_( 'GP W SOW SOL L GF GA PTS?' )}
229
+ ## mx/spanish
230
+ | #{table_heading_( 'M W T L GF GC DIF PTS' )}
231
+ | #{table_heading_( 'M W T L GF GA PTS AVGE' )}
232
+ | #{table_heading_( 'Team M W T L GF-GA PTS')}
233
+ | #{table_heading_( 'Team M W T L GF-GA PTS EP TP')}
234
+ )
235
+ ## note - allow optional blank line - why? why not?
236
+ (?: \n ^[ ]* )?
237
+ \n
238
+ )?
239
+
240
+
241
+ ## MUST be followed by a table (standing) line
242
+ ## e.g. 1.FC Cincinnati 34 20 9 5 57-39 69
243
+ ##
244
+ ## note - allow "run-on" e.g. LB14 on first number
245
+ ## Hudson Valley Quickstrike LB14 12 0 2 40 9 38
246
+ ## Hudson Valley Quickstrike LB12 11 1 0 26 9 33
247
+ ##
248
+ ## 17 11 5 1 40 16 +24 38
249
+ ## or
250
+ ### + 1.DC United 32 17 6/ 3 6 65-43 57
251
+
252
+ ^
253
+ (?:
254
+ [^\n]+?
255
+ (?:
256
+ (?:
257
+
258
+ \d{1,3}
259
+ [ ]+ \d{1,3} ## win
260
+ (?: [ ]+ | [ ]* / [ ]* ) \d{1,3} ## draw
261
+ [ ]+ \d{1,3} ## lose
262
+ [ ]+ \d{1,3} (?: [ ]* [:-] [ ]*
263
+ | [ ]+ ) \d{1,3}
264
+ [ ]+ [+-]? \d{1,3} \b # might be diff or point allow +/-!!
265
+ )
266
+ )
267
+ [^\n]*?
268
+ )
269
+ \n
270
+
271
+ ## eat-up the rest
272
+ .*? ## non-greedy - match everything (incl. newline!) until
273
+ (?: \n (?= \n) ## break on blank line (\n\n) or end-of-string/file
274
+ | \z
275
+ )
276
+
277
+ }ixm
278
+
279
+
280
+
281
+
282
def handle_tables( txt, tables: [] )
  ## Replace every table block matched by TABLE_RE with a "collapsed"
  ## one-line html comment marker and collect the removed text.
  ##
  ## txt    - text to scan
  ## tables - array the matched table blocks get appended to (in order);
  ##          the 1-based position in this array is the table id
  ##
  ## Returns the text with all table blocks replaced by
  ##   <!-- $table<id>$ - <header> -->   or   <!-- $table<id>$ -->
  ## Every processed block is also printed to stdout.

  txt.gsub( TABLE_RE ) do |block|
    ## note - grab the match data first (before any other regex runs);
    ##        header might be missing (nil) e.g. table starting w/ blank line
    header = Regexp.last_match[:header]

    puts " proc table >#{header}< block:"
    puts ">>> (begin)"
    puts block
    puts "<<< (end)"

    ## keep the removed block and use its position as id
    tables << block
    marker = "$table#{tables.size}$"

    header ? "<!-- #{marker} - #{header} -->\n" : "<!-- #{marker} -->\n"
  end
end
313
+
314
+
315
+ end ## class Fmtfix
316
+ end ## module Rsssf
@@ -0,0 +1,50 @@
1
module Rsssf
class Fmtfix   ## todo: find a better name e.g. Format or Fixer or ??

  ##
  ## process/handle Topscorers: ... to first blank line (\n\n)
  ##
  ## block openers (matched case-insensitive), e.g.
  ##    topscorer, topscorers
  ##    top scorer, top scorers
  ##    scorer, scorers

  TOPSCORERS_RE = %r{^ [ ]*
                      (?<header>
                          (?: top [ ]?)?   ## note - optional top
                          scorers?         ## singular or plural
                      )
                      (?: [ ]* :)?    ## note - optional colon
                      [ ]*
                      \n{0,2}    ## note - optional leading blank line!!

                      .*?    ## non-greedy - match everything until
                      (?: \n (?= \n)   ## blank line (\n\n) or end-of-string/file
                          | \z
                      )
                   }ixm


  ## Replace every topscorers block with a "collapsed" marker line
  ##   <!-- $topscorers<id>$ -->   (followed by a blank line)
  ##
  ## txt        - text to scan
  ## topscorers - array the removed blocks get appended to;
  ##              the 1-based position in this array is the id
  ## opts       - if opts[:topscorers] is set, every block gets printed
  ##
  ## Returns the text with the blocks replaced.
  def handle_topscorers( txt, topscorers: [], opts: {} )
    txt.gsub( TOPSCORERS_RE ) do |block|
      puts " proc topscorers block:", block   if opts[:topscorers]

      ## keep the removed block; its position is the id used in the marker
      topscorers << block
      "<!-- $topscorers#{topscorers.size}$ -->\n\n"
    end
  end

end   ## class Fmtfix
end   ## module Rsssf
@@ -0,0 +1,127 @@
1
+ module Rsssf
2
+ class Page
3
+
4
+
5
+ ###
6
+ # (experimental)
7
+ # machinery to split document by leagues & cups
8
+
9
+
10
+ ##
11
+ ## for now simply split
12
+ ## on headings
13
+
14
+
15
+ ### fix - support match with trailing ==== too!!!
16
+
17
+ ### note - starts at
18
## HEADER_RE - extended-mode (x) pattern for one heading line
##   e.g.   == Bundesliga
##
##   named captures:
##     marker - the run of one to six equal signs (=); its size is the level
##     text   - the heading text (lazy)
##   rejected upfront (negative lookahead):
##     lines starting with =-=  and lines of equal signs only
##
##   note - OPT_REF gets interpolated into the pattern; it is NOT defined
##          in this section (defined elsewhere - TODO confirm what it matches)
##   note - the ## comments inside the literal are part of the regex source
+ HEADER_RE = %r{ ## negative lookahead
19
+ ## do NOT match =-=
20
+ ## do NOT match =========== (without any heading text!!)
21
+ ## e.g.
22
+ ## Fall season
23
+ ## ===========
24
+
25
+ (?! ^[ ]* (?: =-=
26
+ | ={1,} [ ]* $
27
+ )
28
+ )
29
+
30
+ ^
31
+ [ ]*
32
+ (?<marker> ={1,6})
33
+ [ ]*
34
+ (?<text> .+?)
35
+ #{OPT_REF}
36
+ [ ]*
37
+ $}x
38
+
39
+
40
+
41
+
42
def _split_sections( txt, level: 2 )
  ## Split the text into sections keyed by heading text.
  ##
  ## txt   - text to split (line by line)
  ## level - heading level (number of = markers) that starts a new section
  ##
  ## Returns a hash of heading text => section body (String).
  ##   - the heading line itself is NOT part of the body
  ##   - lines before the first matching heading get dropped
  ##   - headings of any other level are kept as plain body lines
  ##   - a repeated heading text replaces the earlier section

  sections = {}
  body     = nil

  txt.each_line do |line|
    m = HEADER_RE.match( line )

    if m && m[:marker].size == level
      ## heading on requested level - open up a new section
      body = String.new
      sections[ m[:text] ] = body
    elsif body
      body << line
    end
  end

  sections
end
63
+
64
+
65
+
66
+
67
+
68
+ ## make header required - yes
69
+ ## change to build_schedule - why? why not???
70
+ ## add level: 2 or such - why? why not?
71
def find_schedule!( header: )
  ## strict lookup - hands over to _find_schedule with strict turned on
  ##   header - heading text (String) or path of heading texts (Array)
  _find_schedule( strict: true, header: header )
end
74
+
75
+
76
def _find_schedule( header:, strict: false )
  ## Look up the section text for the heading (path) in @txt
  ## and wrap it in a Schedule.
  ##
  ## header - heading text (String) or path of heading texts (Array),
  ##          one entry per nesting depth
  ## strict - passed on to _walk_sections
  ##
  ## Returns a Schedule or nil if no section text was found.

  ## make sure header is an array
  path = String === header ? [header] : header

  section = _walk_sections( @txt, header: path,
                                  depth:  0,
                                  strict: strict )

  ## wrap in schedule class - why? why not?
  section ? Schedule.new( section ) : nil
end
92
+
93
+
94
def _walk_sections( txt, header:,
                         depth:,
                         strict: false )
  ## Walk down the heading path, one heading level per depth.
  ##
  ## txt    - text to split at this depth
  ## header - array of heading texts (path)
  ## depth  - index into header; depth 0 maps to heading level 2
  ## strict - raise (instead of returning nil) if a section is missing
  ##
  ## Returns the text of the section named by the last path entry,
  ## or nil if a section is missing (and strict is off).
  ## Raises ArgumentError if a section is missing and strict is on.

  query = header[depth]

  ## note - start at level 2
  sections = _split_sections( txt, level: depth+2 )
  found    = sections[ query ]

  unless found
    raise ArgumentError, "section with header >#{query}< not found; sections incl. #{sections.keys}"   if strict
    return nil
  end

  ## last entry in path reached? - done
  return found  unless header[depth+1]

  _walk_sections( found, header: header,
                         depth:  depth+1,
                         strict: strict )
end # method _walk_sections
123
+
124
+
125
+
126
+ end # class Page
127
+ end # module Rsssf
@@ -0,0 +1,68 @@
1
+
2
+
3
module Rsssf
class Page


  ##
  ##  note - \A - start of string
  ##              comment must start .txt document!!!
  ##
  ##  named capture `text` holds everything in-between <!-- and -->
  ##    (lazy, that is, up to the first -->)

  HTML_COMMENT_HEADER_RE = %r{ \A
                [ \n]*     ## leading spaces and blank lines
                <!--
                   [ \n]*
                     (?<text> .+?)
                   [ \n]*
                -->
            }imx


  ###
  ## find meta data block (via html-style comment header )
  ##   incl. title, author(s), url, updated
  ##  e.g.
  ##  <!--
  ##     title:   Austria 2024/25
  ##     source:  https://rsssf.org/tableso/oost2025.html
  ##     author:  Hans Schöggl
  ##     updated: 7 Jul 2025
  ##  -->
  ##  -or-
  ##     authors: Hans Schöggl and Karel Stokkermans
  ##
  ##  txt - text (document) starting with the comment header
  ##
  ##  Returns a hash with symbol keys and string values
  ##    (empty if the comment holds no key: value lines)
  ##    or nil if no comment header was found.

  def self.parse_meta( txt )
    m = HTML_COMMENT_HEADER_RE.match( txt )
    return nil unless m    ## no meta data (comment header) found

    m[:text].each_line.each_with_object( {} ) do |line, meta|
      line = line.strip

      ## note - allow "inline" blank lines and comment lines (starting w/ #)
      next if line.empty? || line.start_with?('#')

      ## split line on first colon (:) (only)
      ##   note - limit split to two pieces!!!
      ##          keeps colons in the value e.g. https://...
      key, value = line.split( /[ ]*:[ ]*/, 2 )

      ## note - skip lines without colon (value is nil) or with empty key;
      ##        otherwise stray text ends up as a junk key (with nil value)
      next if value.nil? || key.empty?

      ## use a symbol (not string) as key - why? why not?
      meta[ key.to_sym ] = value
    end
  end

  ## convenience instance-level shortcut for Page.parse_meta
  def parse_meta( txt )
    self.class.parse_meta( txt )
  end


end   # class Page
end   # module Rsssf