rsssf-parser 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,102 @@
1
+
2
+
3
module Rsssf
class Parser

  ## Group headers - the word "Group" plus one alphanumeric key, e.g.
  ##    Group A     (A-Z)
  ##    Group 12    (1-99)
  ##    Group HEX   (used in CONCACAF world cup qualifying)
  ##    Group 1A    or A1, B1
  ##
  ## note - the whole header (incl. "Group") is captured as (group);
  ##          capturing only the key is an open question
  GROUP_RE = %r{ (?<group> \b Group [ ] [a-z0-9]+ \b ) }ix


  ## optional trailer shared by the third/fifth place variants below,
  ##   e.g. "3rd place match", "Fifth-place play-off", "3rd place final"
  place_suffix = '(?: [ ] (?: match|play-?off|final ))?'

  ## Round headers - captured as (round).
  ##   note - the order of the alternatives is significant
  ##           (first alternative that matches wins)
  ROUND_RE = %r{
    (?<round> \b
      (?:
          # numbered round - a number is required, e.g. Round 1, Matchday 2, Week 3
          (?: Round | Matchday | Week ) [ ] [0-9]+

          # playoffs - playoff, play-off, play-offs (knockout rounds start here)
        | Play-?offs? (?: [ ]for[ ]quarter-?finals )?

          # round of 32
        | Round[ ]of[ ]32
        | Last[ ]32
        | 16th[ ]finals
        | 1/16[ ]finals

          # round of 16
        | Round[ ]of[ ]16
        | Last[ ]16
        | 8th[ ]finals
        | 1/8[ ]finals

          # fifth place
        | (?: Fifth|5th )[ -]place #{place_suffix}
        | Match[ ]for[ ](?: fifth|5th )[ -]place

          # third place
        | (?: Third|3rd )[ -]place #{place_suffix}
        | Match[ ]for[ ](?: third|3rd )[ -]place

          # quarterfinals
        | Quarter-?finals?
        | Quarters
        | Last[ ]8

          # semifinals
        | Semi-?finals?
        | Semis
        | Last[ ]4

          # final
        | Finals?
      )
    \b )
  }ix


  ## Leg markers (1st leg / 2nd leg) - captured as (leg);
  ##   kept separate from the round regex for now (open question)
  LEG_RE = %r{
    (?<leg> \b
       (?: 1st | First       # leg 1
         | 2nd | Second      # leg 2
       )
       [ ]legs?
    \b )
  }ix

end   # class Parser
end   # module Rsssf
@@ -0,0 +1,103 @@
1
module Rsssf
class Parser

  ## Regular score, e.g. 2-1
  ##   - one or two digits per side, captured as (score1) and (score2)
  ##   - must have a space before AND after; both are checked via
  ##     lookbehind / lookahead, that is, the spaces are not consumed
  SCORE_RE = %r{
     (?<score>
        (?<=[ ])
          (?<score1> \d{1,2} ) - (?<score2> \d{1,2} )
        (?=[ ])
     )
  }ix


  ## Extended score info in square brackets - supported variants:
  ##    [aet]
  ##    [aet, 3-2 pen]
  ##    [aet; 3-2 pen]
  ##    [3-2 pen]
  ##    [3-2 pen.]
  ##
  ## todo - allow dot (.) as separator too?  e.g. [aet. 3-2 pen]
  ##
  ## note - the brackets themselves are NOT part of the (score_ext) capture

  ## penalty shootout score, e.g. 3-2 pen or 3-2 pen.
  penalties = '\d{1,2}-\d{1,2} [ ]? pen\.?'

  SCORE_EXT_RE = %r{
     \[
       (?<score_ext>
            aet (?: [,;][ ]* #{penalties} )?     ## aet with optional penalties
          |
            #{penalties}                         ## penalties only
       )
     \]
  }ix


  ## Status markers used in place of a score.
  ##   all of them must have a space before AND after
  ##   (lookbehind / lookahead - the spaces are not consumed).
  status_re = ->( name, marker ) do
    Regexp.new( "(?<#{name}> (?<=[ ]) #{Regexp.escape( marker )} (?=[ ]) )",
                Regexp::IGNORECASE | Regexp::EXTENDED )
  end

  SCORE_AWD_RE = status_re.call( 'score_awd', 'awd' )   ## awd - awarded
  SCORE_ABD_RE = status_re.call( 'score_abd', 'abd' )   ## abd - abandoned
  SCORE_PPD_RE = status_re.call( 'score_ppd', 'ppd' )   ## ppd - postponed
  SCORE_NP_RE  = status_re.call( 'score_np',  'n/p' )   ## n/p - not played

  ## w/o - walkover (also W.O.; originally two words: "walk over") -
  ##   awarded to the opposing team/player if no other players are
  ##   available, or they have been disqualified, forfeited or
  ##   have withdrawn from the contest.
  SCORE_WO_RE  = status_re.call( 'score_wo',  'w/o' )

end   # class Parser
end   # module Rsssf
103
+
@@ -0,0 +1,162 @@
1
module Rsssf
class Parser

  ## Text tokens (team names, player names and such).
  ##
  ## Two flavors:
  ##   TEXT_RE         - standard mode
  ##   TEXT_STRICT_RE  - stricter variant (no digits, no & or /), meant for
  ##                     use inside brackets
  ##
  ## note on performance - every run of letters is wrapped in an atomic
  ##   group (?> ... ).  Without it the pattern has the shape
  ##      letters+ (?: separator? letters+ | ... )*
  ##   and a word followed by a character that is NOT in the closing
  ##   lookahead (e.g. "Internacional(BRA)") makes the regex engine try
  ##   every possible split of the word - exponential in the word length.
  ##   The atomic groups do not change what is matched, because a match
  ##   can never end (or resume) in the middle of a run of letters.


  ## simple (double) quoted text
  ##   only unicode letters PLUS (single) inline spaces,
  ##   e.g. "Tiago" or "Cristiano Ronaldo"
  ##
  ## note - kept as a plain string (fragment) for interpolation below
  TEXT_QUOTED = '(?: " '          +
                '     \p{L}+ '    +
                '     (?: [ ] '   +
                '       \p{L}+ )* ' +
                '   " ) '


  ## "strict" text
  ##   - unicode letters only (no numbers, no & or such)
  ##   - allows a single inline space or dash/hyphen (-) between parts
  ##   - allows dot (.) and apostrophe (') e.g. Cote d'Ivoire
  ##   - may start with a quoted text (must be followed by a space) or
  ##     have a quoted text inline, e.g.
  ##        "Tiago" Cardoso Mendes 80
  ##        "Cristiano Ronaldo" dos Santos Aveiro 74
  ##   - must be followed by space, one of  , ; @ | [ ]  or end of line
  ##     (lookahead only - the follow-up character is not consumed)
  TEXT_STRICT_RE = %r{
      (?<text>
          (?: \b |
              #{TEXT_QUOTED} [ ]     ## leading quoted text must be followed by space
          )
          (?> \p{L}+ )               ## all unicode letters - atomic, see note on top

          (?:
              (?: [ ]                ## only single spaces allowed inline
                |
                  [-]
              )?
              (?:
                  (?> \p{L}+ ) |
                  ['.]  |
                  (?:
                     (?<= [ ])
                       #{TEXT_QUOTED}
                     (?= [ ]|$)      ## inline quoted text needs a space (or end of line) after
                  )
              )
          )*

          ## closing lookahead - a word boundary cannot be used here
          ##   because the text may end in a dot or other non-alphanumeric
          (?= [ ,;@|\[\]]
              |$
          )
      )
  }ix


  ## standard text
  ##   - starts at a word boundary with letters, or with a number followed
  ##     by an optional dot, an optional space and letters, e.g.
  ##        1 FC,  1. FC,  1.FC,  1FC,  1st,  1890 Munich
  ##   - continues with parts separated by a single optional space or dash;
  ##     a part is a run of letters, one of  & / ' .  or a number that is
  ##     not followed by any of  0-9 . : ' / + -
  ##   - a space in front of  awd / abd / ppd / n/p / w/o  (plus space)
  ##     ends the text, so that the status markers get their own token
  ##   - may end with one parenthesized part, e.g. (Hamburg);
  ##     no numbers inside the parentheses
  ##   - must be followed by space, one of  , ; @ | : [ ]  or end of line
  ##     (lookahead only - the follow-up character is not consumed)
  TEXT_RE = %r{
      (?<text>
          \b                         ## require word boundary
          (?:
              ## opt 1 - start with letters
              (?> \p{L}+ )
            |
              ## opt 2 - start with a number, e.g. 1. FC
              \d+
              \.?                    ## optional dot
              [ ]?                   ## optional space, e.g. 1st, 2nd, 5th
              (?> \p{L}+ )
          )

          (?:
              (?:
                  (?: [ ]
                      (?! (?:awd|abd|ppd|n/p|w/o)[ ] )   ## not a status marker
                  )
                |                    ## only single spaces allowed inline
                  [-]
              )?
              (?:
                  (?> \p{L}+ ) | [&/'.]
                |
                  (?:
                     \d+
                     (?![0-9.:'/+-])      ## number must not run into a score, time or such
                  )
              )
          )*

          ## optional trailing part in parentheses, e.g. (Hamburg)
          (?:
              [ ]\( (?> \p{L}+ )
                    (?:
                        (?: [ ] |
                            [-]
                        )?
                        (?> \p{L}+ ) | [&/'.]
                    )*
                  \)
          )?

          ## closing lookahead - kind of a word boundary
          (?= [ ,;@|:\[\]]
              |$
          )
      )
  }ix

end   # class Parser
end   # module Rsssf
162
+
@@ -0,0 +1,230 @@
1
+
2
+
3
module Rsssf
class Parser

  ## whitespace and single-character symbols - shared by both match modes
  ##   note - the two-or-more spaces alternative is listed first,
  ##           so a run of spaces is matched as one (spaces)
  BASICS_RE = %r{
       (?<spaces> [ ]{2,}) |
       (?<space>  [ ])
          |
       (?<sym>[;,@|\[\]])
  }ix


  ## versus separator - a dash that must have a space before AND after
  ##   (lookbehind / lookahead - the spaces are not consumed)
  VS_RE = %r{
      (?<vs>
         (?<=[ ])
           -
         (?=[ ])
      )
  }ix


  ## standard mode - the order of the alternatives is significant
  ##   (the generic text regex comes last)
  ## note - DATE_RE and NOTE_RE are not defined in this file;
  ##          presumably they come from the token-date / token-note files
  RE = Regexp.union( GROUP_RE, ROUND_RE, LEG_RE,
                     DATE_RE,
                     VS_RE,
                     SCORE_RE,
                     SCORE_AWD_RE, SCORE_ABD_RE, SCORE_PPD_RE, SCORE_NP_RE,
                     SCORE_WO_RE,
                     SCORE_EXT_RE,
                     NOTE_RE,
                     BASICS_RE,
                     TEXT_RE )


  ## inside mode - "strict" text matching; active between [ and ]
  ## note - GOAL_OG_RE, GOAL_PEN_RE and MINUTE_RE are not defined in this
  ##          file; presumably they come from the token-goals file
  INSIDE_RE = Regexp.union( GOAL_OG_RE, GOAL_PEN_RE,
                            BASICS_RE,
                            TEXT_STRICT_RE,
                            MINUTE_RE )


  ## named captures that map one-to-one to a [name, text] token;
  ##   checked in order, the first capture that is set wins
  STANDARD_TOKEN_NAMES = %i[text note group round leg date vs
                            score
                            score_awd score_abd score_ppd score_np score_wo
                            score_ext].freeze

  INSIDE_TOKEN_NAMES   = %i[text minute og pen].freeze

  ## symbols that get returned "inline" as is, e.g. [:','];
  ##   the brackets are handled separately (they switch the match mode)
  INLINE_SYMS = [',', ';', '@', '|'].freeze


  ## Append msg (plus a newline) to ./logs.txt (utf-8).
  def log( msg )
    File.open( './logs.txt', 'a:utf-8' ) do |f|
      f.write( msg )
      f.write( "\n" )
    end
  end


  ## Split line into tokens.
  ##
  ## Matching starts at the begin of the line and continues at the end of
  ## the previous match.  Text that no regex matches gets skipped and
  ## reported as a parse error (printed, appended to ./logs.txt and
  ## added to the errors list).
  ##
  ## note - the match mode (standard RE or INSIDE_RE) is kept in @re
  ##   BETWEEN calls; an opening bracket [ switches to inside mode and
  ##   a closing bracket ] switches back to standard mode.
  ##
  ## line  - the text line to tokenize
  ## debug - if true, trace the matches and mode switches to stdout
  ##
  ## returns [tokens, errors] - tokens are arrays, either [name, text]
  ##   e.g. [:text, "Ajax"] or a single symbol e.g. [:','];
  ##   errors are strings
  def tokenize_with_errors( line, debug: false )
    tokens  = []
    errors  = []

    puts ">#{line}<"    if debug

    pos     = 0
    ## offsets of the last match - needed after the loop to check
    ##   for unmatched text at the end of the line
    offsets = [0,0]

    @re ||= RE     ## quick hack - keep match mode between tokenize calls

    while (m = @re.match( line, pos ))
      if debug
        pp m
        puts "pos: #{pos}"
      end
      offsets = [m.begin(0), m.end(0)]

      ## match NOT starting at the current position - report the gap
      if offsets[0] != pos
        errors << report_skipped( line, pos...offsets[0], offsets[0], offsets[1] )
      end

      pos = offsets[1]

      pp offsets   if debug

      ## note - check the mode BEFORE building the token;
      ##          a bracket switches the mode for the NEXT match
      t = if @re == INSIDE_RE
            inside_token( m, debug )
          else
            standard_token( m, debug )
          end

      tokens << t    if t

      if debug
        print ">"
        print "*" * pos
        puts "#{line[pos..-1]}<"
      end
    end

    ## no (more) match, but not yet at the end of the line - report the rest
    if offsets[1] != line.size
      errors << report_skipped( line, offsets[1]...line.size, offsets[1], line.size )
    end

    [tokens,errors]
  end


  ### convenience helper - same as tokenize_with_errors,
  ##    but returns the tokens only (errors get dropped)
  def tokenize( line, debug: false )
    tokens, _ = tokenize_with_errors( line, debug: debug )
    tokens
  end


  private

  ## Print and log a parse error for the skipped text in range;
  ## returns the (short) error message for the errors list.
  def report_skipped( line, range, from, to )
    error = "parse error - skipping >#{line[range]}< @#{from},#{to}"
    msg   = "!! WARN - #{error} in line >#{line}<"
    puts msg
    log( msg )
    error
  end

  ## Build the token for a match in standard mode; returns nil for
  ## matches that produce no token (spaces and brackets).
  ##   side effect - an opening bracket [ switches to inside mode
  def standard_token( m, debug )
    return nil   if m[:space] || m[:spaces]

    name = STANDARD_TOKEN_NAMES.find { |key| m[key] }
    return [name, m[name]]   if name

    sym = m[:sym]
    if sym == '['
      puts " enter inside match mode"   if debug
      @re = INSIDE_RE
      nil
    elsif INLINE_SYMS.include?( sym )
      [sym.to_sym]
    else
      nil    ## closing bracket ] while in standard mode (or no capture set) - ignore
    end
  end

  ## Build the token for a match in inside mode; returns nil for
  ## matches that produce no token (spaces and brackets).
  ##   side effect - a closing bracket ] switches back to standard mode
  def inside_token( m, debug )
    return nil   if m[:space] || m[:spaces]

    name = INSIDE_TOKEN_NAMES.find { |key| m[key] }
    return [name, m[name]]   if name

    sym = m[:sym]
    if sym == ']'
      puts " leave inside match mode"   if debug
      @re = RE
      nil
    elsif INLINE_SYMS.include?( sym )
      [sym.to_sym]
    else
      nil    ## opening bracket [ while already inside (or no capture set) - ignore
    end
  end

end   # class Parser
end   # module Rsssf
230
+
@@ -0,0 +1,21 @@
1
+
2
+ ####
3
+ ## build on "standard" parse
4
+ require 'sportdb/parser'
5
+
6
+
7
+ ## our own code
8
+ require_relative 'parser/token-text'
9
+ require_relative 'parser/token-note'
10
+ require_relative 'parser/token-round' ## round (& group)
11
+ require_relative 'parser/token-date'
12
+ require_relative 'parser/token-score'
13
+ require_relative 'parser/token-goals'
14
+ require_relative 'parser/token'
15
+
16
+ require_relative 'parser/parser'
17
+
18
+ require_relative 'parser/linter'
19
+
20
+
21
+