rsssf-parser 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,102 @@
1
+
2
+
3
module Rsssf
  class Parser

    ## Token patterns for group, round and leg headings.
    ##
    ## All three patterns are case-insensitive (i) and written in
    ##  extended mode (x) - whitespace inside the pattern is ignored,
    ##  a literal space is always written as [ ].
    ##  Each pattern exposes exactly one named capture (group, round or leg);
    ##  all other parentheses are non-capturing.


    ## Group heading - the word "Group", one space and
    ##   a run of letters and/or digits e.g.
    ##     Group A-Z
    ##     Group 1-99
    ##     Group HEX              # used in CONCACAF world cup qualifying
    ##     Group 1A or A1, B1   - used anywhere
    ##
    ##   use "key" of group - why? why not?

    GROUP_RE = %r{(?<group>
                     \b
                     Group [ ]
                     [a-z0-9]+
                     \b)}ix


    ## Round heading - a numbered round (Round 1, Matchday 2, Week 3)
    ##   or one of the named knockout rounds listed below.
    ROUND_RE = %r{(?<round>
                    \b
                    (?:
                       # numbered round - note - requires a number e.g. round 1, 2, etc.
                       (?: (?: Round    |
                               Matchday |
                               Week
                           )
                           [ ] [0-9]+
                       )
                       |
                       # more (knockout) rounds
                       # playoffs - playoff, play-off, play-offs
                       (?: Play-?offs?
                           (?: [ ]for[ ]quarter-?finals )?
                       )
                       |
                       # round32
                       (?: Round[ ]of[ ]32 |
                           Last[ ]32       |
                           16th[ ]finals   |
                           1/16[ ]finals
                       )
                       |
                       # round16
                       (?: Round[ ]of[ ]16 |
                           Last[ ]16       |
                           8th[ ]finals    |
                           1/8[ ]finals
                       )
                       |
                       # fifthplace
                       (?:
                          (?: (?: Fifth|5th )[ -]place
                              (?: [ ] (?: match|play-?off|final ))?
                          ) |
                          (?: Match[ ]for[ ](?: fifth|5th )[ -]place )
                       )
                       |
                       # thirdplace
                       (?:
                          (?: (?: Third|3rd )[ -]place
                              (?: [ ] (?: match|play-?off|final ))?
                          ) |
                          (?: Match[ ]for[ ](?: third|3rd )[ -]place )
                       )
                       |
                       # quarterfinals
                       (?:
                          Quarter-?finals? |
                          Quarters         |
                          Last[ ]8
                       )
                       |
                       # semifinals
                       (?:
                          Semi-?finals? |
                          Semis         |
                          Last[ ]4
                       )
                       |
                       # final
                       Finals?
                    )
                    \b)}ix


    ##
    ## Leg marker - 1st/First leg(s) or 2nd/Second leg(s)
    ##   keep leg separate (from round) - why? why not?
    ##
    LEG_RE = %r{ (?<leg>
                   \b
                   (?:
                      # leg1
                      (?: 1st|First )[ ]legs?
                      |
                      # leg2
                      (?: 2nd|Second )[ ]legs?
                   )
                   \b)}ix


  end  # class Parser
end    # module Rsssf
@@ -0,0 +1,103 @@
1
module Rsssf
  class Parser

    ## Token patterns for scores and score status codes.
    ##
    ## All patterns are case-insensitive (i) and use extended mode (x) -
    ##   whitespace inside the pattern is ignored,
    ##   a literal space is always written as [ ].


    ######
    ## regular score e.g. 2-1
    ##   one or two digits on each side of the dash;
    ##   must have a space before and after (checked via lookaround,
    ##   the spaces are not part of the match)
    SCORE_RE = %r{
                 (?<score>
                    (?<=[ ])    # positive lookbehind for space
                    (?<score1>\d{1,2}) - (?<score2>\d{1,2})
                    (?=[ ])     # positive lookahead for space
                 )
               }ix

    ## score extras in square brackets e.g.
    ##   [aet]
    ##   [aet, 3-2 pen]
    ##   [aet; 3-2 pen]
    ##   [3-2 pen]
    ##   [3-2 pen.]
    ##   [aet, 9-8 pen]
    ##
    ##  note - the brackets are matched but are NOT part of
    ##           the score_ext capture
    ##
    ##  - add dot (.) as separator too ??   e.g.  [aet. 3-2 pen]

    SCORE_EXT_RE = %r{ \[
                       (?<score_ext>
                           (?:         ## aet only e.g. aet
                             aet
                              (?:      ## optional pen
                                 [,;][ ]*
                                 \d{1,2}-\d{1,2} [ ]? pen\.?
                              )?
                           )
                           |
                           (?:         ## penalty only e.g. 3-2 pen
                              \d{1,2}-\d{1,2} [ ]? pen\.?
                           )
                       )
                       \]
                     }ix


    ## build the pattern for a score status code (awd, abd, etc.)
    ##   name - suffix of the named capture i.e. score_<name>
    ##   code - literal text to match (regex-escaped)
    ##
    ##   note - must be space before and after!!!
    ##            (checked via lookbehind / lookahead)
    score_status_re = lambda do |name, code|
      %r{
          (?<score_#{name}>
             (?<=[ ])
             #{Regexp.escape( code )}
             (?=[ ])
          )
        }ix
    end

    ### awd - awarded
    SCORE_AWD_RE = score_status_re.call( 'awd', 'awd' )

    ### abd - abandoned
    SCORE_ABD_RE = score_status_re.call( 'abd', 'abd' )

    ### ppd - postponed
    SCORE_PPD_RE = score_status_re.call( 'ppd', 'ppd' )

    ### n/p - not played
    SCORE_NP_RE  = score_status_re.call( 'np', 'n/p' )

    ## A walkover, also W.O. or w/o (originally two words: "walk over"),
    ##  is awarded to the opposing team/player etc,
    ##  if there are no other players available,
    ##  or they have been disqualified,
    ##  because the other contestants have forfeited or
    ##  the other contestants have withdrawn from the contest.
    ##
    ### w/o - walk over
    SCORE_WO_RE  = score_status_re.call( 'wo', 'w/o' )


  end  # class Parser
end    # module Rsssf
103
+
@@ -0,0 +1,162 @@
1
module Rsssf
  class Parser

    ## Token patterns for free text (team names, player names and such).
    ##
    ## Both regexes are case-insensitive (i) and use extended mode (x) -
    ##   whitespace inside the pattern is ignored,
    ##   a literal space is always written as [ ].
    ##   Both expose the matched text as the named capture "text".


    ## note - do NOT allow single alpha text for now
    ##   add later??    A - B    C - D    - why?
    ##   opt 1) one alpha
    ##            (?<text_i> [a-z])   # only allow single letter text (not numbers!!)
    ##
    ##   opt 2) more than one alphanum


    ### allow special case - starting text with number e.g.
    ##    number must be followed by space or dot (.)
    #  1 FC         ## allow 1-FC or 1FC   - why? why not?
    #  1. FC
    #  1.FC
    #  1FC
    #  1890 Munich
    #

    ##
    #  allow Cote'd Ivoir or such
    ##   e.g. add '


    ## note - use a more strict text re(gex)
    ##          if inside brackets !!!!

    ###
    ## "simple" strict text regex
    ###   no numbers (or & or such inside)
    ##    allows dash/hyphen (-)
    ##     and dot (.) and apostrophe (') for now


    ## simple (double) quoted text
    ##   only supports a-z (unicode) PLUS (single) inline space
    ##   add more chars - why? why not?
    ##
    ##  note - a regex source fragment (string);
    ##           only interpolated into extended mode (x) regexes below
    TEXT_QUOTED = '(?: " ' +
                  ' \p{L}+ ' +
                  ' (?: [ ] ' +
                  ' \p{L}+ )* ' +
                  ' " ) '


    ### might start with "" !!!
    ##   e.g.
    ##    "Tiago" Cardoso Mendes 80
    ##    "Cristiano Ronaldo" dos Santos Aveiro 74
    ##    "Zé Castro" José Eduardo Rosa Vale Castro 60og


    TEXT_STRICT_RE = %r{
        (?<text>
            (?: \b | #{TEXT_QUOTED} [ ]    ## note - leading quoted text must be followed by space!!
            )
            \p{L}+    ## all unicode letters (e.g. [a-z])

            (?:
               (?:[ ]
                  |        # only single spaces allowed inline!!!
                  [-]
               )?
               (?:
                  \p{L}+ |
                  ['.]   |
                  (?:
                     (?<= [ ])
                     #{TEXT_QUOTED}
                     (?= [ ]|$)    ### must be followed by space (or end of line)
                     ## todo/fix - add all end of text lookaheads too (see below)
                  )
               )
            )*
            ## must NOT end with space or dash(-)
            ##  todo/fix - possible in regex here
            ##     only end in alphanum a-z0-9 (not dot or & ???)

            ## positive lookahead
            ##   cannot use \b if text ends in dot (.) or other non-alphanum
            ##   then \b will not work
            (?=[ ,;@|\[\]]
                 |$
            )
        )
    }ix



    TEXT_RE = %r{
        ## must start with alpha (allow unicode letters!!)
        (?<text>
            \b    ## use/require word boundary
            (?:
               # opt 1 - start with alpha
               \p{L}+    ## all unicode letters (e.g. [a-z])
               |

               # opt 2 - start with num!! - allow special case (e.g. 1. FC)
               \d+       # number, followed by
               \.?       ##  optional dot
               [ ]?      ##  optional space too - why? why not?
                         ##    yes - eg. 1st, 2nd, 5th etc.
               \p{L}+    ##  MUST be followed by a to z!!!!
            )

            (?:(?:  (?:[ ]
                       (?! (?: awd|abd|ppd|n/p|w/o )[ ])   ## note - exclude score status codes (awd[ ]/abd[ ]/n/p[ ] etc.)
                    )
                    |        # only single spaces allowed inline!!!
                    [-]
                 )?
                 (?:
                    \p{L}+ | [&/'.]
                    |
                    (?:
                      \d+
                      (?![0-9.:'/+-])
                      ## negative lookahead for numbers
                      ##   note - include digits itself!!!
                    )
                 )
            )*    ## must NOT end with space or dash(-)
                  ##  todo/fix - possible in regex here
                  ##     only end in alphanum a-z0-9 (not dot or & ???)

            ## support (Hamburg) or such at the end (only)
            ##   note - no numbers allowed inside () for now!!
            ##   note - the optional space/dash separator applies to
            ##            letters AND to the symbols & / ' .
            ##            (same as in the text body above) e.g. (A & B)
            (?:
               [ ]\(\p{L}+
                  (?:
                     (?: [ ] |
                         [-]
                     )?
                     (?: \p{L}+ | [&/'.] )
                  )*
               \)
            )?


            ## add lookahead/lookbehind
            ##    must be space!!!
            ##   (or comma or start/end of string)
            ##   kind of \b !!!
            ## positive lookahead
            ##   note - added : too - why? why not?
            (?=[ ,;@|:\[\]]
                 |$
            )
        )
    }ix



  end  # class Parser
end    # module Rsssf
162
+
@@ -0,0 +1,230 @@
1
+
2
+
3
module Rsssf
  class Parser

    ## Line tokenizer - turns a text line into an array of tokens using
    ##  one of two regex "modes":
    ##    RE         - standard mode
    ##    INSIDE_RE  - "strict" mode used between [ and ]
    ##
    ##  note - GROUP_RE, ROUND_RE, LEG_RE, DATE_RE, SCORE_*_RE, NOTE_RE,
    ##           TEXT_RE, TEXT_STRICT_RE, GOAL_OG_RE, GOAL_PEN_RE and MINUTE_RE
    ##           are NOT defined here - they must be loaded before this code


    ## whitespace (two or more spaces or a single space) and
    ##   the single-character symbols  ; , @ | [ ]
    BASICS_RE = %r{
        (?<spaces> [ ]{2,}) |
        (?<space>  [ ])
           |
        (?<sym>[;,@|\[\]])
    }ix


    ## versus (vs) separator - a dash
    VS_RE = %r{   ## must be space before and after!!!
        (?<vs>
           (?<=[ ])    # positive lookbehind for space
             -
           (?=[ ])     # positive lookahead for space
        )
    }ix


    ## standard mode
    RE = Regexp.union( GROUP_RE, ROUND_RE, LEG_RE,
                       DATE_RE,
                       VS_RE,
                       SCORE_RE,
                       SCORE_AWD_RE, SCORE_ABD_RE, SCORE_PPD_RE, SCORE_NP_RE,
                       SCORE_WO_RE,
                       SCORE_EXT_RE,
                       NOTE_RE,
                       BASICS_RE,
                       TEXT_RE )

    ## "strict" text match mode inside brackets  [ ]
    INSIDE_RE = Regexp.union( GOAL_OG_RE, GOAL_PEN_RE,
                              BASICS_RE,
                              TEXT_STRICT_RE,
                              MINUTE_RE,
                            )

    ## named captures turned into [name, value] tokens;
    ##   checked in the order listed (first non-nil capture wins)
    STANDARD_TOKEN_NAMES = %i[text note group round leg date vs
                              score score_awd score_abd score_ppd
                              score_np score_wo score_ext].freeze
    INSIDE_TOKEN_NAMES   = %i[text minute og pen].freeze
    private_constant :STANDARD_TOKEN_NAMES, :INSIDE_TOKEN_NAMES


    ## append msg (plus newline) to ./logs.txt
    ##    use ./errors.txt - why? why not?
    def log( msg )
      File.open( './logs.txt', 'a:utf-8' ) do |f|
        f.write( msg )
        f.write( "\n" )
      end
    end


    ## tokenize a line
    ##   returns a pair [tokens, errors]
    ##     tokens - [name, value] pairs or, for the symbols , ; @ |
    ##                one-element arrays e.g. [:',']
    ##     errors - one message per skipped (unmatched) text run
    ##
    ##   every skipped text run is also printed and logged (see log).
    ##   debug: true prints a trace of the matching.
    ##
    ##   note - the regex mode (@re) is kept between calls;
    ##            a [ without a closing ] leaves the NEXT call in inside mode
    def tokenize_with_errors( line, debug: false )
      tokens = []
      errors = []   ## keep a list of errors - why? why not?

      puts ">#{line}<"    if debug

      pos = 0
      ## track last offsets - to report error on no match
      ##   or no match in end of string
      offsets = [0,0]

      ####
      ## quick hack - keep re state/mode between tokenize calls!!!
      @re ||= RE     ## note - switch between RE & INSIDE_RE

      while (m = @re.match( line, pos ))
        if debug
          pp m
          puts "pos: #{pos}"
        end
        offsets = [m.begin(0), m.end(0)]

        if offsets[0] != pos
          ## match NOT starting at start/begin position!!!
          ##  report parse error!!!
          ##  note - the positions reported are the offsets of the match
          ##           following the skipped text (not of the skipped text)
          report_parse_error( errors, line,
                              line[pos..(offsets[0]-1)],
                              offsets[0], offsets[1] )
        end

        pos = offsets[1]

        pp offsets    if debug

        t = if @re == INSIDE_RE
              inside_token( m, debug: debug )
            else   ## assume standard mode/ctx
              standard_token( m, debug: debug )
            end

        tokens << t    if t

        if debug
          print ">"
          print "*" * pos
          puts "#{line[pos..-1]}<"
        end
      end

      ## check if no match in end of string
      if offsets[1] != line.size
        report_parse_error( errors, line,
                            line[offsets[1]..-1],
                            offsets[1], line.size )
      end

      [tokens,errors]
    end


    ### convenience helper - ignore errors by default
    def tokenize( line, debug: false )
      tokens, _ = tokenize_with_errors( line, debug: debug )
      tokens
    end


    private

    ## print, log and record a skipped text run
    def report_parse_error( errors, line, skipped, from, to )
      error = "parse error - skipping >#{skipped}< @#{from},#{to}"
      msg   = "!! WARN - #{error} in line >#{line}<"
      puts msg
      log( msg )

      errors << error
    end

    ## build the token for a match in inside mode (or nil to skip);
    ##   switches back to standard mode on ]
    def inside_token( m, debug: false )
      return nil   if m[:space] || m[:spaces]    ## skip space(s)

      name = INSIDE_TOKEN_NAMES.find { |key| m[key] }
      return [name, m[name]]   if name

      sym = m[:sym]
      ## return symbols "inline" as is - why? why not?
      case sym
      when ',', ';', '@', '|' then [sym.to_sym]
      when ']'
        puts " leave inside match mode"    if debug
        @re = RE
        nil
      else
        ## note - includes [ (already in inside mode) and no capture at all
        ##   report error - why? why not?
        nil
      end
    end

    ## build the token for a match in standard mode (or nil to skip);
    ##   switches to inside mode on [
    def standard_token( m, debug: false )
      return nil   if m[:space] || m[:spaces]    ## skip space(s)

      name = STANDARD_TOKEN_NAMES.find { |key| m[key] }
      return [name, m[name]]   if name

      sym = m[:sym]
      ## return symbols "inline" as is - why? why not?
      case sym
      when ',', ';', '@', '|' then [sym.to_sym]
      when '['
        puts " enter inside match mode"    if debug
        @re = INSIDE_RE
        nil
      else
        ## note - includes ] (already in standard mode) and no capture at all
        ##   report warn/error - why? why not?
        nil
      end
    end

  end  # class Parser
end    # module Rsssf
230
+
@@ -0,0 +1,21 @@
1
+
2
+ ####
3
+ ## build on the "standard" parser
4
+ require 'sportdb/parser'
5
+
6
+
7
+ ## our own code
8
+ require_relative 'parser/token-text'
9
+ require_relative 'parser/token-note'
10
+ require_relative 'parser/token-round' ## round (& group)
11
+ require_relative 'parser/token-date'
12
+ require_relative 'parser/token-score'
13
+ require_relative 'parser/token-goals'
14
+ require_relative 'parser/token'
15
+
16
+ require_relative 'parser/parser'
17
+
18
+ require_relative 'parser/linter'
19
+
20
+
21
+