sportdb-parser 0.7.1 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +1 -1
  3. data/Manifest.txt +17 -4
  4. data/lib/sportdb/parser/lexer-on_goal.rb +172 -0
  5. data/lib/sportdb/parser/lexer-on_group_def.rb +31 -0
  6. data/lib/sportdb/parser/lexer-on_prop_lineup.rb +79 -0
  7. data/lib/sportdb/parser/lexer-on_prop_misc.rb +110 -0
  8. data/lib/sportdb/parser/lexer-on_prop_penalties.rb +40 -0
  9. data/lib/sportdb/parser/lexer-on_round_def.rb +37 -0
  10. data/lib/sportdb/parser/lexer-on_top.rb +125 -0
  11. data/lib/sportdb/parser/lexer-prep_doc.rb +131 -0
  12. data/lib/sportdb/parser/lexer-prep_line.rb +63 -0
  13. data/lib/sportdb/parser/lexer-tokenize.rb +449 -0
  14. data/lib/sportdb/parser/lexer.rb +133 -1363
  15. data/lib/sportdb/parser/lexer_buffer.rb +8 -37
  16. data/lib/sportdb/parser/lexer_token.rb +126 -0
  17. data/lib/sportdb/parser/parser.rb +1104 -1403
  18. data/lib/sportdb/parser/racc_parser.rb +36 -32
  19. data/lib/sportdb/parser/racc_tree.rb +65 -98
  20. data/lib/sportdb/parser/token-date--helpers.rb +130 -0
  21. data/lib/sportdb/parser/token-date--names.rb +108 -0
  22. data/lib/sportdb/parser/token-date.rb +20 -192
  23. data/lib/sportdb/parser/token-date_duration.rb +8 -27
  24. data/lib/sportdb/parser/token-geo.rb +16 -16
  25. data/lib/sportdb/parser/token-goals--helpers.rb +114 -0
  26. data/lib/sportdb/parser/token-goals.rb +103 -249
  27. data/lib/sportdb/parser/token-group.rb +8 -22
  28. data/lib/sportdb/parser/token-prop.rb +138 -124
  29. data/lib/sportdb/parser/token-prop_name.rb +48 -39
  30. data/lib/sportdb/parser/token-round.rb +21 -35
  31. data/lib/sportdb/parser/token-score--helpers.rb +189 -0
  32. data/lib/sportdb/parser/token-score.rb +9 -393
  33. data/lib/sportdb/parser/token-score_full.rb +331 -0
  34. data/lib/sportdb/parser/token-status.rb +44 -46
  35. data/lib/sportdb/parser/token-status_inline.rb +112 -0
  36. data/lib/sportdb/parser/token-text.rb +41 -31
  37. data/lib/sportdb/parser/token-time.rb +29 -26
  38. data/lib/sportdb/parser/token.rb +58 -159
  39. data/lib/sportdb/parser/version.rb +1 -1
  40. data/lib/sportdb/parser.rb +45 -17
  41. metadata +19 -6
  42. data/lib/sportdb/parser/blocktxt.rb +0 -99
  43. data/lib/sportdb/parser/lexer_tty.rb +0 -111
  44. data/lib/sportdb/parser/token-table.rb +0 -149
  45. data/lib/sportdb/parser/token_helpers.rb +0 -92
@@ -0,0 +1,131 @@
module SportDb
  class Lexer

    ##
    ## matches one html-style comment e.g. <!-- ... -->
    ##   note - the m flag lets the dot (.) match newlines too,
    ##          thus, a comment may span more than one line
    HTML_COMMENT_RE = %r{ <!--
                            .*?     ## note - use non-greedy/lazy *? match
                          -->
                        }xm

    ##
    ## check for "literal" (multi-line) note blocks
    ##   eg. nb: or note:  (case-insensitive)
    ##   at least one space required after the colon (:) - why? why not?
    ##
    ##  the block runs up to (but not including) the next blank line
    ##   or up to the end-of-file/document
    PREPROC_NOTA_BENE_RE = %r{
                              ^
                              [ ]* (?: nb | note) [ ]* : [ ]+
                              .+?    ## non-greedy

                              ## positive lookahead
                              ##  note - must end with blank line or end-of-file/document
                              (?= \n[ ]*\n
                                  | \z
                              )
                            }xim

    ##
    ##  note - [] block may NOT incl. square brackets
    ##           and may NOT incl. the comment marker (#)
    ##           (all three chars are excluded by the char class)
    ##    todo/check - rename to NOTE_BLOCK or TEXT_BLOCK or ???
    PREPROC_BLOCK_RE = %r{ \[
                             [^\[\]\#]*?    ## note - use non-greedy/lazy *? match
                           \]
                         }xm


    ##
    ## preprocess the complete document text (before splitting into lines)
    ##
    ##   steps (in order):
    ##    1) normalize unicode to composed form (nfc)
    ##    2) replace windows-style newlines cr+lf (\r\n) with lf (\n)
    ##    3) replace html-style comments with two spaces
    ##    4) fold multi-line nb:/note: blocks into a single line
    ##    5) fold multi-line [] blocks into a single line
    ##
    ##   note - empty lines and leading spaces (indent) are kept as is
    ##
    ##  txt     - the document text (String)
    ##  returns - the preprocessed text (new String)
    def _prep_doc( txt )
      ### normalize unicode (decomposed chars to composed chars)
      ##   e.g.  e (101) + combining acute accent (769)  =>  é (233)
      txt = txt.unicode_normalize(:nfc)

      ## replace all windows-style cr+lf (\r\n) to lf (\n) only
      ##   note - a lone cr (\r) is NOT touched
      txt = txt.gsub( "\r\n", "\n" )

      ###
      ##  quick hack for now
      ##    replace html-style comments  <!-- -->  (incl. multi-line)
      ##     with two spaces
      ##
      ##   note - a multi-line comment collapses into a single line,
      ##          that is, line numbers after the comment are off
      ##   todo/fix - keep lineno intact - why? why not?
      txt = txt.gsub( HTML_COMMENT_RE ) do |m|
        _trace('preproc html comment:', m )
        '  '    ## two spaces (as documented)
      end

      #####
      ##  (another) quick hack for now
      ##    turn multi-line note blocks into single-line note blocks
      ##    by changing every newline (\n) to ↵ (unicode U+21B5,
      ##    downwards arrow with corner leftwards)
      txt = txt.gsub( PREPROC_NOTA_BENE_RE ) do |m|
        if m.include?( "\n" )
          _trace('preproc (multi-line) note/nota bene block:', m )
          m.gsub( "\n", '↵' )
        else
          m      ## single-line block - keep as is
        end
      end

      ## same newline folding for [] blocks
      txt = txt.gsub( PREPROC_BLOCK_RE ) do |m|
        if m.include?( "\n" )
          _trace( 'preproc (multi-line) block:', m )
          m.gsub( "\n", '↵' )
        else
          m      ## single-line block - keep as is
        end
      end

      txt
    end

  end  # class Lexer
end    # module SportDb
@@ -0,0 +1,63 @@
module SportDb
  class Lexer

    ######
    ## auto-fix checks line-by-line
    ##
    ##  replaces "look-alike" chars with the ascii version
    ##   and reports every replacement via _warn:
    ##    - tab (\t)                              =>  two spaces
    ##    - non-breaking space U+00A0 (160)       =>  space
    ##    - en dash U+2013 / minus sign U+2212    =>  dash (-)
    ##    - smart single quotes U+2018 / U+2019   =>  quote (')
    ##    - smart double quotes U+201C / U+201D   =>  double quote (")
    ##
    ##  line    - the line (String); NOT changed in-place
    ##  returns - the auto-fixed line (new String)
    ##
    ##  note - every warning quotes the line as it was
    ##         BEFORE the current replacement step
    ##         (line gets reassigned only once gsub is done)
    def _prep_line( line )
      ## todo/add error here (for now warn only)
      line = line.gsub( "\t" ) do |_|
        _warn( "auto-fix; replacing tab (\\t) with two spaces in line #{line.inspect}" )
        '  '    ## replace with two spaces (as documented and reported)
      end

      ##  U+00A0 (160) -- non-breaking space (unicode)
      line = line.gsub( "\u00A0" ) do |uni|
        _warn( "auto-fix; replacing non-breaking unicode space (#{uni}/#{uni.ord}) w/ ascii space ( /#{" ".ord}) in line #{line.inspect}" )
        ' '     ## replace with space
      end

      ###
      ##  – => U+2013 (8211) -- En Dash (unicode)
      ##  − => U+2212 (8722) -- Minus Sign (unicode)
      line = line.gsub( /[–−]/ ) do |uni|
        _warn( "auto-fix; replacing unicode dash (#{uni}/#{uni.ord}) w/ ascii dash (-/#{"-".ord}) in line #{line.inspect}" )
        '-'     ## replace with ascii dash (-)
      end

      ####  todo - add more quotes to unsmart
      ##  ‘ => U+2018 (8216), ’ => U+2019 (8217)
      line = line.gsub( /[‘’]/ ) do |uni|
        _warn( "auto-fix; replacing unicode (smart) quote (#{uni}/#{uni.ord}) w/ ascii quote ('/#{"'".ord}) in line #{line.inspect}" )
        "'"
      end

      ##  “ => U+201C (8220), ” => U+201D (8221)
      line = line.gsub( /[“”]/ ) do |uni|
        _warn( %Q{auto-fix; replacing unicode (smart) double quote (#{uni}/#{uni.ord}) w/ ascii double quote ("/#{'"'.ord}) in line #{line.inspect}} )
        '"'
      end

      line
    end

  end  # class Lexer
end    # module SportDb
@@ -0,0 +1,449 @@
module SportDb
  class Lexer

    ###
    ## per-line context passed along to the handlers
    ##   (_on_round_def, _on_top, etc.) in _tokenize_line
    ##
    ##   note: first arg passed in MUST be ref to lexer (instance);
    ##         the lexer must respond to log and _warn
    ##   note - for now only offset (in line begin/end) gets updated !!!
    class Context
      attr_writer :offset
      attr_reader :lineno

      ##  lexer  - the lexer (used for log & _warn)
      ##  line   - the line (text) getting tokenized
      ##  lineno - the line number (used in messages)
      ##  errors - array collecting the messages (shared with caller)
      def initialize( lexer,
                      line:,
                      lineno:,
                      errors: )
        @lexer  = lexer
        @line   = line
        @lineno = lineno
        @errors = errors

        ## MatchData offset of the current match
        ##   e.g. [m.begin(0),m.end(0)]
        @offset = [0,0]
      end

      ## report a match that no handler branch knows about;
      ##   expects the catch-all capture in match[:any]
      def warn_on_else( match, mode: 'TOP' )
        if match[:any]
          _add_warn( "unexpected char >#{match[:any]}< (#{mode})" )
        else
          ## internal error - shouldn't really happen
          _add_warn( "internal error - unknown match (#{mode}): #{match.inspect}")
        end
      end

      ## add a message with line number and offset;
      ##   note - warns get logged as error for now too
      ##   note - add +1 to offset (start at one - not zero-based)
      ##          to match up with (external) text editors
      def _add_warn( msg )
        msg = "parse error (tokenize) - " +
              msg +
              " in line @#{@lineno}:#{@offset[0]+1},#{@offset[1]+1} >#{@line}< "

        @errors << msg
        @lexer.log( "!! WARN - #{msg}" )

        @lexer._warn( msg )
      end

      ## todo - add an _add_error( msg ) variant that only does
      ##          @errors << msg   (no log / no warn) - why? why not?
    end # class Context


    ##
    ## tokenize a single line
    ##
    ##  line    - the line (text)
    ##  lineno  - the line number (stored in tokens & messages)
    ##  returns - [tokens, errors]
    ##
    ##  note - the active regex (@re) is kept BETWEEN calls;
    ##         the prop modes stay active for the next line
    ##         if the last token of the line is one of , - ;
    ##         the goal modes never get reset in here
    ##         (presumably handled by the goal handlers - check)
    def _tokenize_line( line, lineno )
      tokens = []
      errors = []   ## keep a list of errors - why? why not?

      pos = 0       ## note - usually same as offset[1] aka offset[end] after match
      ## track last offset (begin/end) - to report error on no match
      ##   or no match in end of string
      offset = [0,0]
      m = nil

      ## track number of geo text seen
      ##  (use for - do NOT break on two spaces if no geo text seen yet!!)
      @geo_count = 0

      ####
      ##  quick hack - keep re state/mode between tokenize calls!!!
      @re ||= RE

      if @re == RE    ## top-level
        ### check for modes once (per line) here to speed-up parsing
        if (m = START_WITH_ORD.match(line))
          ## note - ord (for ordinal number!!!) e.g match number (1), (42), etc.
          ##   can only start a (match) line; "inline" use NOT possible
          tokens << Token.new(:ORD, m[:ord],
                              lineno: lineno, offset: m.offset(:ord),
                              value: m[:value].to_i(10) )
        elsif (m = START_WITH_YEAR.match(line))
          tokens << Token.new(:YEAR, m[:year],
                              lineno: lineno, offset: m.offset(:year),
                              value: m[:year].to_i(10) )
        elsif (m = START_WITH_GROUP_DEF_LINE_RE.match( line ))
          _trace( "ENTER GROUP_DEF_RE MODE" )
          @re = GROUP_DEF_RE

          tokens << Token.new( :GROUP_DEF, m[:group_def],
                               lineno: lineno, offset: m.offset(:group_def) )
        elsif (m = START_WITH_PROP_KEY_RE.match( line ))
          ## start with prop key - switch into the matching prop mode
          _trace("ENTER PROP_RE MODE" )
          key = m[:key]

          ### todo/fix - separate sent off and red card
          @re, type = case key.downcase
                      when 'sent off'      then [PROP_CARDS_RE,      :PROP_SENTOFF]
                      when 'red cards'     then [PROP_CARDS_RE,      :PROP_REDCARDS]
                      when 'yellow cards'  then [PROP_CARDS_RE,      :PROP_YELLOWCARDS]
                      when 'ref', 'referee',
                           'refs', 'referees'   ## note - allow/support assistant refs
                                           then [PROP_REFEREE_RE,    :PROP_REFEREE]
                      when 'att', 'attn', 'attendance'
                                           then [PROP_ATTENDANCE_RE, :PROP_ATTENDANCE]
                      when 'penalties',
                           'penalty shootout',
                           'penalty shoot-out',
                           'penalty kicks'
                                           then [PROP_PENALTIES_RE,  :PROP_PENALTIES]
                      else  ## assume (team) line-up
                        ## fix-fix-fix - rename to PROP_LINEUP !!
                        [PROP_LINEUP_RE, :PROP]
                      end

          tokens << Token.new( type, m[:key],
                               lineno: lineno, offset: m.offset(:key))
        elsif (m = ROUND_DEF_OUTLINE_RE.match( line ))
          ### todo/fix - rename to START_WITH_ROUND_DEF_OUTLINE_RE !!!!
          _trace( "ENTER ROUND_DEF_RE MODE" )
          @re = ROUND_DEF_RE

          ## note - return ROUND_DEF NOT ROUND_OUTLINE token
          tokens << Token.new( :ROUND_DEF, m[:round_outline],
                               lineno: lineno, offset: m.offset(:round_outline))
        elsif (m = ROUND_OUTLINE_RE.match( line ))
          _trace( "ROUND_OUTLINE" )
          ## note - derive round level from no of (leading) markers
          ##    e.g. ▪/:: is 1, ▪▪/::: is 2, ▪▪▪/:::: is 3, etc.
          ##  note - ascii-style starts with double ::, thus, autodecrement by one!
          round_level = m[:round_marker].size
          round_level -= 1 if m[:round_marker].start_with?( '::' )

          tokens << Token.new( :ROUND_OUTLINE, m[:round_outline],
                               lineno: lineno, offset: m.offset(:round_outline),
                               value: { outline: m[:round_outline],
                                        level:   round_level})
        elsif (m = START_GOAL_LINE_RE.match( line ))
          ## switch context to one of the goal line modes
          ##   and pass along a "virtual" token for the style
          if START_GOAL_LINE_COMPAT_RE.match(line )
            ## "legacy" style starting with minute e.g.
            ##   (6 Puskás 0-1, 9 Czibor 0-2, 11 Morlock 1-2)
            @re = GOAL_COMPAT_RE
            _trace( "ENTER GOAL_COMPAT_RE MODE" )

            tokens << Token.virtual( :GOALS_COMPAT, lineno: lineno )
          elsif START_GOAL_LINE_ALT_RE.match( line )
            ## goals with scores e.g.
            ##   (1-0 Franck Ribéry, 2-0 Ivica Olić, 2-1 Wayne Rooney)
            @re = GOAL_ALT_RE
            _trace( "ENTER GOAL_ALT_RE MODE" )

            tokens << Token.virtual( :GOALS_ALT, lineno: lineno )
          else
            ## "standard" / default style
            @re = GOAL_RE
            _trace( "ENTER GOAL_RE MODE" )

            tokens << Token.virtual( :GOALS, lineno: lineno )
          end
        end

        ## note - m is only set if one of the start-of-line checks matched;
        ##        eat-up the complete match (incl. the goal line opener)
        if m
          offset = m.offset(0)
          pos    = offset[1]
        end
      end


      old_pos = -1   ## allows to backtrack to old pos (used in geo)

      ctx = Context.new( self,
                         line:   line,
                         lineno: lineno,
                         errors: errors )

      while m = @re.match( line, pos )
        offset = m.offset(0)
        ctx.offset = offset

        if offset[0] != pos
          ## match NOT starting at start/begin position!!!
          ##  report parse error (skipped text run)!!!
          msg = "parse error (tokenize) - skipping >#{line[pos..(offset[0]-1)]}< in line #{lineno}@#{offset[0]},#{offset[1]} >#{line}<"
          errors << msg

          log( msg )
          puts "!! WARN - #{msg}"
        end

        old_pos = pos
        pos = offset[1]

        ## note - a handler returns a token or nil (nothing to add);
        ##        a handler may switch the mode (@re) for the next match
        t = if    @re == ROUND_DEF_RE  then  _on_round_def( m, ctx: ctx )
            elsif @re == GROUP_DEF_RE  then  _on_group_def( m, ctx: ctx )
            elsif @re == GEO_RE
              if m[:spaces]
                ## note: break on double spaces
                ##   but do NOT break out if no geo text seen yet!!!
                if @geo_count > 0
                  ## get out-off geo mode and backtrack (w/ next)
                  ##   that is, match the same text again in top-level mode
                  ##  todo/fix - add virtual geo_end token!!!
                  _trace( "LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" )
                  @re = RE
                  pos = old_pos
                  next
                else
                  nil   ## skip spaces
                end
              elsif m[:space]
                nil     ## skip (single) space
              elsif m[:text]
                @geo_count += 1
                Token.new(:GEO, m[:text],
                          lineno: lineno, offset: m.offset(:text))
              elsif m[:geo_end]   ## "hacky" special comma; always ends geo mode!!!
                ## get out-off geo mode and backtrack (w/ next)
                _trace( "LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" )
                @re = RE
                pos = old_pos
                next
              elsif m[:sym]
                case m[:sym]
                when ','
                  ## note - reset geo_count to 0 (avoids break on two spaces)
                  ##          if separator seen!!
                  @geo_count = 0
                  Token.literal( m[:sym], lineno: lineno, offset: m.offset(:sym))
                when '›', '>'
                  ## note - treat geo sep › (unicode) and > (ascii)
                  ##          like comma for now!!!
                  @geo_count = 0
                  Token.literal( ',', lineno: lineno, offset: m.offset(:sym))
                when '['
                  ## get out-off geo mode and backtrack (w/ next)
                  _trace( "LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" )
                  @re = RE
                  pos = old_pos
                  next
                else
                  Token.literal( m[:sym], lineno: lineno, offset: m.offset(:sym))
                end
              else
                ctx.warn_on_else( m, mode: 'GEO' )
                nil
              end
            elsif @re == PROP_CARDS_RE       then  _on_prop_cards( m, ctx: ctx )
            elsif @re == PROP_LINEUP_RE      then  _on_prop_lineup( m, ctx: ctx )
            elsif @re == PROP_ATTENDANCE_RE  then  _on_prop_attendance( m, ctx: ctx )
            elsif @re == PROP_REFEREE_RE     then  _on_prop_referee( m, ctx: ctx )
            elsif @re == PROP_PENALTIES_RE   then  _on_prop_penalties( m, ctx: ctx )
            elsif @re == GOAL_COMPAT_RE      then  _on_goal_compat( m, ctx: ctx )
            elsif @re == GOAL_ALT_RE         then  _on_goal_alt( m, ctx: ctx )
            elsif @re == GOAL_RE             then  _on_goal( m, ctx: ctx )
            else
              ## assume TOP_LEVEL (a.k.a. RE) machinery
              _on_top( m, ctx: ctx )
            end

        tokens << t   if t
      end

      ## check if no match in end of string
      if offset[1] != line.size
        msg = "parse error (tokenize) - skipping >#{line[offset[1]..-1]}< in line #{lineno}@#{offset[1]},#{line.size} >#{line}<"
        errors << msg

        log( msg )
        puts "!! WARN - #{msg}"
      end


      if @re == GEO_RE   ### ALWAYS switch back to top level mode
        _trace( "LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" )
        @re = RE
      end

      ### ALWAYS switch back to top level mode
      @re = RE  if @re == GROUP_DEF_RE ||
                   @re == ROUND_DEF_RE

      ##
      ## if in prop mode continue if last token is [,-;]
      ##   otherwise change back to "standard" mode
      if @re == PROP_LINEUP_RE     ||
         @re == PROP_CARDS_RE      ||
         @re == PROP_PENALTIES_RE  ||
         @re == PROP_ATTENDANCE_RE ||
         @re == PROP_REFEREE_RE
        ## note - tokens may be empty here
        ##   (prop mode carried over from the line before
        ##    and nothing tokenized in this line);
        ##   treat as end of prop (do NOT crash on nil)
        last_token = tokens[-1]
        if last_token && [',', '-', ';'].include?( last_token.type )
          ## continue/stay in PROP_RE mode
        else
          ## switch back to top-level mode!!
          _trace( "LEAVE PROP_RE MODE, BACK TO TOP_LEVEL/RE" )
          @re = RE
          ## note - auto-add PROP_END (<PROP_END>)
          tokens << Token.virtual(:PROP_END, lineno: lineno)
        end
      end

      [tokens,errors]
    end

  end  ## class Lexer
end    ## module SportDb