sportdb-parser 0.6.20 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +1 -1
  3. data/Manifest.txt +14 -8
  4. data/Rakefile +1 -1
  5. data/lib/sportdb/parser/blocktxt.rb +99 -0
  6. data/lib/sportdb/parser/lexer.rb +958 -395
  7. data/lib/sportdb/parser/lexer_buffer.rb +97 -0
  8. data/lib/sportdb/parser/lexer_tty.rb +111 -0
  9. data/lib/sportdb/parser/parser.rb +1768 -855
  10. data/lib/sportdb/parser/racc_parser.rb +1 -1
  11. data/lib/sportdb/parser/racc_tree.rb +327 -41
  12. data/lib/sportdb/parser/token-date.rb +160 -178
  13. data/lib/sportdb/parser/token-date_duration.rb +190 -0
  14. data/lib/sportdb/parser/token-geo.rb +59 -59
  15. data/lib/sportdb/parser/token-goals.rb +460 -0
  16. data/lib/sportdb/parser/token-group.rb +43 -0
  17. data/lib/sportdb/parser/token-note.rb +40 -0
  18. data/lib/sportdb/parser/token-prop.rb +70 -54
  19. data/lib/sportdb/parser/token-prop_name.rb +74 -0
  20. data/lib/sportdb/parser/token-round.rb +102 -0
  21. data/lib/sportdb/parser/token-score.rb +323 -47
  22. data/lib/sportdb/parser/token-score_fuller.rb +435 -0
  23. data/lib/sportdb/parser/token-score_legs.rb +59 -0
  24. data/lib/sportdb/parser/token-status.rb +157 -160
  25. data/lib/sportdb/parser/token-table.rb +149 -0
  26. data/lib/sportdb/parser/token-text.rb +72 -23
  27. data/lib/sportdb/parser/token-time.rb +141 -0
  28. data/lib/sportdb/parser/token.rb +242 -105
  29. data/lib/sportdb/parser/token_helpers.rb +92 -0
  30. data/lib/sportdb/parser/version.rb +2 -2
  31. data/lib/sportdb/parser.rb +24 -2
  32. metadata +18 -18
  33. data/config/rounds_de.txt +0 -125
  34. data/config/rounds_en.txt +0 -29
  35. data/config/rounds_es.txt +0 -26
  36. data/config/rounds_misc.txt +0 -25
  37. data/config/rounds_pt.txt +0 -4
  38. data/config/zones_en.txt +0 -20
  39. data/lib/sportdb/parser/lang.rb +0 -298
  40. data/lib/sportdb/parser/token-minute.rb +0 -205
@@ -25,194 +25,386 @@ end
25
25
  ## for now for compatibility
26
26
  def is_group?( text ) Lang.is_group?( text ); end
27
27
  def is_round?( text ) Lang.is_round?( text ); end
28
- def is_leg?( text ) Lang.is_leg?( text ); end
29
- def is_zone?( text ) Lang.is_zone?( text ); end
28
+
30
29
 
31
- ## transforms
32
- ##
33
- ## Netherlands 1-2 (1-1) England
34
- ## => text => team
35
- ## score|vs
36
- ## text => team
37
-
38
-
39
- ## token iter/find better name
40
- ## e.g. TokenBuffer/Scanner or such ??
41
- class Tokens
42
- def initialize( tokens )
43
- @tokens = tokens
44
- @pos = 0
45
- end
46
30
 
47
- def pos() @pos; end
48
- def eos?() @pos >= @tokens.size; end
49
31
 
32
+ def debug?() @debug == true; end
50
33
 
51
- def include?( *types )
52
- pos = @pos
53
- ## puts " starting include? #{types.inspect} @ #{pos}"
54
- while pos < @tokens.size do
55
- return true if types.include?( @tokens[pos][0] )
56
- pos +=1
57
- end
58
- false
59
- end
34
+ def initialize( lines, debug: false )
35
+ raise ArgumentError, "(string) text expected for lexer; got #{lines.class.name}" unless lines.is_a?(String)
36
+
37
+ @debug = debug
38
+ @txt = lines
39
+ end
60
40
 
61
- ## pattern e.g. [:TEXT, [:VS,:SCORE], :TEXT]
62
- def match?( *pattern )
63
- ## puts " starting match? #{pattern.inspect} @ #{@pos}"
64
- pattern.each_with_index do |types,offset|
65
- ## if single symbol wrap in array
66
- types = types.is_a?(Array) ? types : [types]
67
- return false unless types.include?( peek(offset) )
68
- end
69
- true
70
- end
71
41
 
42
+ HTML_COMMENT_RE = %r{ <!--
43
+ .*? ## note - use non-greedy/lazy *? match
44
+ -->
45
+ }xm ## note - turn on multi-line match (for dot (.))
72
46
 
73
- ## return token type (e.g. :TEXT, :NUM, etc.)
74
- def cur() peek(0); end
75
- ## return content (assumed to be text)
76
- def text(offset=0)
77
- ## raise error - why? why not?
78
- ## return nil?
79
- if peek( offset ) != :text
80
- raise ArgumentError, "text(#{offset}) - token not a text type"
81
- end
82
- @tokens[@pos+offset][1]
83
- end
84
47
 
48
+ ##
49
+ ## note - [] block may NOT incl. square brackets
50
+ ## what about comments (e.g. #)?
51
+ ## todo/check - rename to NOTE_BLOCK or TEXT_BLOCK or ???
52
+ PREPROC_BLOCK_RE = %r{ \[
53
+ [^\[\]\#]*? ## note - use non-greedy/lazy *? match
54
+ \]
55
+ }xm ## note - turn on multi-line match (for dot(.))
85
56
 
86
- def peek(offset=1)
87
- ## return nil if eos
88
- if @pos+offset >= @tokens.size
89
- nil
90
- else
91
- @tokens[@pos+offset][0]
92
- end
93
- end
94
57
 
95
- ## note - returns complete token
96
- def next
97
- # if @pos >= @tokens.size
98
- # raise ArgumentError, "end of array - #{@pos} >= #{@tokens.size}"
99
- # end
100
- # throw (standard) end of iteration here why? why not?
58
+ ##
59
+ ## check for "literal" (multi-line) note blocks
60
+ ## eg. nb: or note:
61
+ ## space required after colon - why? why not?
62
+ PREPROC_NOTA_BENE_RE = %r{
63
+ ^
64
+ [ ]* (?: nb | note) [ ]* : [ ]+
65
+ .+? ## non-greedy
66
+
67
+ ## positive lookahead
68
+ ## note - must end with blank line or end-of-file/document
69
+ ## note - do NOT eat-up trailing hrule (---)
70
+ (?= (?: \n [ ]* -{3,} [ ]*)?
71
+ \n[ ]*\n
72
+ | \z
73
+ )
74
+ }xim
101
75
 
102
- t = @tokens[@pos]
103
- @pos += 1
104
- t
105
- end
76
+ ##
77
+ ## replace "escaped" newline with non-newline char e.g. '↵'
78
+ LINE_CONTINUATION_RE = %r{
79
+ \\[ ]* \n
80
+ }x
106
81
 
107
- def collect( &blk )
108
- tokens = []
109
- loop do
110
- break if eos?
111
- tokens << if block_given?
112
- blk.call( self.next )
113
- else
114
- self.next
115
- end
116
- end
117
- tokens
118
- end
119
- end # class Tokens
120
82
 
121
83
 
84
+ ###
85
+ ## check for magic comments
86
+ ## e.g # teletype: true or TELETYPE: TRUE
87
+ ## tty/teletype
88
+
89
+ MAGIC_COMMENT_RE = %r{ \A
90
+ [ ]* ## optional leading spaces
91
+ \#+ ## note - allow ##,###, etc. too
92
+ [ ]* ## optional spaces
93
+ (?<magic_comment_key> tty | teletype )
94
+ [ ]* ## optional spaces
95
+ :
96
+ [ ]* ## optional spaces
97
+ (?<magic_comment_value> true | false )
98
+ [ ]* ## optional trailing spaces
99
+ \z
100
+ }ix
101
+
122
102
 
123
103
 
124
- def debug?() @debug == true; end
125
104
 
126
- def initialize( lines, debug: false )
127
- @debug = debug
128
105
 
129
- ## note - for convenience - add support
130
- ## comments (incl. inline end-of-line comments) and empty lines here
131
- ## why? why not?
132
- ## why? keeps handling "centralized" here in one place
133
106
 
134
- ## todo/fix - rework and make simpler
135
- ## no need to double join array of string to txt etc.
107
+ def tokenize_with_errors
108
+
109
+ ####
110
+ ## flags / modes
111
+ @teletype = false # use magic comment - tty/teletype: true
136
112
 
137
- txt_pre = if lines.is_a?( Array )
138
- ## join together with newline
139
- lines.reduce( String.new ) do |mem,line|
140
- mem << line; mem << "\n"; mem
141
- end
142
- else ## assume single-all-in-one txt
143
- lines
144
- end
145
113
 
146
- ## preprocess automagically - why? why not?
114
+
115
+ tokens_by_line = [] ## note: add tokens line-by-line (flatten later)
116
+ errors = [] ## keep a list of errors - why? why not?
117
+
118
+ ## preprocess automagically - why? why not?
147
119
  ## strip lines with comments and empty lines stripped / removed
148
120
  ## keep empty lines? why? why not?
149
121
  ## keep leading spaces (indent) - why?
150
122
  ##
151
123
  ## note - KEEP empty lines (get turned into BLANK token!!!!)
152
124
 
153
- @txt = String.new
154
- txt_pre.each_line do |line| ## preprocess
155
- line = line.strip
156
- next if line.start_with?('#') ### skip comments
157
-
158
- line = line.sub( /#.*/, '' ).strip ### cut-off end-of line comments too
159
-
160
- @txt << line
161
- @txt << "\n"
125
+
126
+ ## "universal" newlines
127
+ ## replace all windows-style cr+lf (\r\n) to lf (\n) only
128
+ txt = @txt.gsub( "\r\n", "\n" )
129
+
130
+
131
+
132
+ ###
133
+ ## quick hack for now
134
+ ## remove html-style comments <!-- -->
135
+ ## (incl. multi-line) with two spaces
136
+ ## will mess-up lineno tracking!!!
137
+ ## fix later to have function lineno & colno!!!
138
+ txt = @txt.gsub( HTML_COMMENT_RE ) do |m|
139
+ puts " [debug] preproc html comment:"
140
+ puts m
141
+ ' '
142
+ end
143
+
144
+
145
+ =begin
146
+ ##
147
+ ## todo/fix - add a command line switch/option for auto-format fixes !!!
148
+ ## quick hack - remove later
149
+ ## auto-convert "old" legacy round markers (»)
150
+ txt = txt.gsub( %r{^ [ ]*
151
+ »
152
+ (?= [ ]+) ## require one trailing space for now!!
153
+ }ix ) do |_|
154
+ puts "!! WARN - auto-fix format; replacing old (alternate/legacy) round marker (»)"
155
+ '▪'
156
+ end
157
+
158
+
159
+ ### 16.00 => 16:00
160
+ ## todo/check - use space for positive lookbehind & ahead
161
+ ## (instead of \b) - why? why not?
162
+ ## note - check for/exclude 12.12. date in match
163
+ ## use negative lookahead
164
+ ## check for 12.12.94
165
+ ## use positive lookbehind !!!
166
+ ## must be space, comma or begin-of-line [ ,]|^
167
+ ## or use negative lookbehind
168
+ ## must NOT be dot
169
+ txt = txt.gsub( %r{
170
+ ## check NEGATIVE lookbehind
171
+ (?<! [.]) ## do NOT match 12.94 in 12.12.94
172
+ \b
173
+ (?<h>\d{1,2})
174
+ \.
175
+ (?<m>\d{2})
176
+ \b
177
+ (?! [.] ) ## do NOT match 12.12.
178
+ }ix ) do |_|
179
+ m = $~ ## is $LAST_MATCH_DATA
180
+ puts "!! WARN - auto-fix format; replacing old (alternate/legacy) time format #{m[0]}"
181
+ "#{m[:h]}:#{m[:m]}" ## '\1:\2'
182
+ end
183
+ =end
184
+
185
+
186
+
187
+
188
+ ###
189
+ ## add more "native" multi-line comment-styles
190
+ ## e.g. #[[ ... ]] or #<<< .. >>> or #<< .. >>
191
+ ## or such - why? why not?
192
+
193
+
194
+ txt = txt.gsub( PREPROC_NOTA_BENE_RE ) do |m|
195
+ if m.include?( "\n" ) ## check for newlines (\n) and replace
196
+ puts " [debug] preproc (multi-line) note/nota bene block:"
197
+ puts m
198
+ ## todo/check: replace with two spaces instead of ↵ - why? why not?
199
+ m.gsub( "\n", '↵' )
200
+ else
201
+ m
202
+ end
162
203
  end
163
- end
164
204
 
165
205
 
206
+ ##
207
+ ## e.g. used in (multi-line) TableNote
208
+ ## 1.SOUTH KOREA 6 5 1 0 22- 1 16 [0-0]
209
+ ## 2.LEBANON 6 3 1 2 11- 8 10 [0-2, 0-0]
210
+ ## 3.Turkmenistan 6 3 0 3 8-11 9 [3-1]
211
+ ## 4.Sri Lanka 6 0 0 6 2-23 0 [0-1]
212
+ ## -.North Korea [withdrew after playing 5 matches due to safety concerns in
213
+ ## connection with the Covid-19 pandemic; all results annulled]
214
+ ##
215
+ ## note - no longer used for now
216
+ ## enclose multi-line notes in []
217
+ ## removes need for line continuation for now
166
218
 
167
- def tokenize_with_errors
168
- tokens_by_line = [] ## note: add tokens line-by-line (flatten later)
169
- errors = [] ## keep a list of errors - why? why not?
219
+ ##
220
+ ## txt = txt.gsub( LINE_CONTINUATION_RE ) do |_|
221
+ ## puts " [debug] preproc line continuation"
222
+ ## ## todo/check: replace with two spaces instead of ↵ - why? why not?
223
+ ## '↵'
224
+ ## end
225
+
226
+
227
+
228
+ #####
229
+ ## (another) quick hack for now
230
+ ## turn multi-line note blocks into
231
+ ## single-line note blocks
232
+ ## by changing newline (\n) to ⏎ (unicode U+23CE)
233
+ ## or why not to ___ ?
234
+ ##
235
+ ## unicode options for return/arrows:
236
+ ## - ↵ (U+21B5): Downwards Arrow With Corner Leftwards.
237
+ ## This is the most common "carriage return" symbol.
238
+ ## - ⏎ (U+23CE): Return Symbol.
239
+ ## Specifically designated as the keyboard's "Return" key symbol,
240
+ ## often used in user interfaces.
241
+
242
+ txt = txt.gsub( PREPROC_BLOCK_RE ) do |m|
243
+ if m.include?( "\n" ) ## check for newlines (\n) and replace
244
+ puts " [debug] preproc (multi-line) block:"
245
+ puts m
246
+ ## todo/check: replace with two spaces instead of ↵ - why? why not?
247
+ m.gsub( "\n", '↵' )
248
+ else
249
+ m
250
+ end
251
+ end
252
+
253
+
254
+ ####
255
+ ## quick hack - keep re state/mode between tokenize calls!!!
256
+ @re ||= RE ## note - switch between RE & INSIDE_RE
170
257
 
171
- @txt.each_line do |line|
172
- line = line.rstrip ## note - MUST remove/strip trailing newline (spaces optional)!!!
173
-
174
- more_tokens, more_errors = _tokenize_line( line )
175
-
176
- tokens_by_line << more_tokens
177
- errors += more_errors
178
- end # each line
179
258
 
180
- tokens_by_line = tokens_by_line.map do |tokens|
181
- #############
182
- ## pass 1
183
- ## replace all texts with keyword matches
184
- ## (e.g. group, round, leg, etc.)
259
+ txt.each_line do |line|
260
+ ## line = line.rstrip ## note - MUST remove/strip trailing newline (spaces optional)!!!
261
+ line = line.strip ## note - strip leading AND trailing whitespaces
262
+ ## note - trailing whitespace may incl. \n or \r\n!!!
263
+
264
+
185
265
  ##
186
- ## note - let is_round? get first (before is_group?)
187
- ## will match group stage as round (NOT group)
188
- tokens = tokens.map do |t|
189
- if t[0] == :TEXT
190
- text = t[1]
191
- t = if is_round?( text ) || is_leg?( text ) || is_zone?( text )
192
- [:ROUND, text]
193
- elsif is_group?( text )
194
- [:GROUP, text]
195
- else
196
- t ## pass through as-is (1:1)
197
- end
198
- end
199
- t
266
+ ###
267
+ ## check for magic comments
268
+ ## e.g # teletype: true or TELETYPE: TRUE
269
+ ## tty/teletype
270
+
271
+ if line.start_with?('#') ### skip comments (& check magic comments!!)
272
+
273
+ if (m = MAGIC_COMMENT_RE.match(line))
274
+ magic_comment_key = m[:magic_comment_key].downcase
275
+ magic_comment_value = m[:magic_comment_value].downcase
276
+
277
+ ## turn on teletype mode
278
+ ## e.g. tty: true or teletype: true
279
+ if ['tty', 'teletype'].include?( magic_comment_key ) &&
280
+ ['true'].include?( magic_comment_value )
281
+ puts " magic comment - turn on teletype (tty) mode"
282
+ @teletype = true
283
+ end
284
+ end
285
+
286
+ next
287
+ end
288
+
289
+ line = line.sub( /#.*/, '' ).strip ### cut-off end-of line comments too
290
+
291
+
292
+ ####
293
+ # support __END__ marker to cut-off input
294
+ break if line.strip == '__END__'
295
+
296
+
297
+
298
+ ##
299
+ ## first check for tabs
300
+ ## add error/warn
301
+ ## for auto-fix - replace tabs with two spaces
302
+
303
+ line = line.gsub( "\t" ) do |_|
304
+ ## report error here
305
+ ## todo/add error here
306
+ puts "!! WARN - auto-fix; replacing tab (\\t) with two spaces in line #{line.inspect}"
307
+ " " ## replace with two spaces
200
308
  end
201
309
 
202
- ### check for "section" starters e.g. Teams or such
203
- t = tokens[0]
204
- if t[0] == :TEXT
205
- text = t[1]
206
- if text =~ /^teams$/i
207
- t[0] = :TEAMS
208
- elsif text =~ /^blank$/i ### todo/fix -- remove!!! add real blanks!!
209
- t[0] = :BLANK
210
- else
310
+
311
+ ## U+00A0 (160) -- non-breaking space (unicode)
312
+ line = line.gsub( "\u00A0" ) do |uni|
313
+ ## report error here
314
+ ## todo/add error here
315
+ puts "!! WARN - auto-fix; replacing non-breaking unicode space (#{uni}/#{uni.ord}) w/ ascii space ( /#{" ".ord}) in line #{line.inspect}"
316
+ " " ## replace with space
317
+ end
318
+
319
+ ###
320
+ ## todo/fix - print unicode numbers for [–−]
321
+ ## different candidates to differentiate and document!!!
322
+ ## – => U+2013 (8211) -- En Dash (unicode)
323
+ ## − => U+2212 (8722) -- Minus Sign (unicode)
324
+ line = line.gsub( /[–−]/ ) do |uni|
325
+ ## report error here
326
+ ## todo/add error here
327
+ puts "!! WARN - auto-fix; replacing unicode dash (#{uni}/#{uni.ord}) w/ ascii dash (-/#{"-".ord}) in line #{line.inspect}"
328
+ '-' ## replace with ascii dash (-)
329
+ end
330
+
331
+
332
+
333
+ puts "line: >#{line}<" if debug?
334
+
335
+ ######
336
+ ### special case for empty line (aka BLANK)
337
+ if line.empty?
338
+ ## note - blank always resets parser mode to std/top-level!!!
339
+ @re = RE
340
+ tokens_by_line << [[:BLANK, '<|BLANK|>']]
341
+ elsif (m = HEADING_RE.match(line))
342
+ ## note - heading always resets parser mode to std/top-level!!!
343
+ @re = RE
344
+ puts " HEADING" if debug?
345
+ ## note - derive heading level from no of (leading) markers
346
+ ## e.g. = is 1, == is 2, === is 3, etc.
347
+ heading_level = m[:heading_marker].size
348
+ tokens_by_line << [[:"H#{heading_level}", m[:heading]]]
349
+ elsif (m = NOTA_BENE_RE.match(line))
350
+ ## note - nota bene always resets parser mode to std/top-level!!!
351
+ @re = RE
352
+ tokens_by_line << [[:NOTA_BENE, m[:nota_bene]]]
353
+ elsif @re == RE && (m = TABLE_RE.match(line))
354
+ @re = TABLE_MORE_RE ## switch into table mode
355
+ if m[:table_heading]
356
+ tokens_by_line << [[:TABLE_HEADING, m[:table_heading]]]
357
+ else ## assume table (line) e.g. m[:table]
358
+ tokens_by_line << [[:TABLE_LINE, line]]
359
+ end
360
+ elsif @re == TABLE_MORE_RE
361
+ ### todo/fix - check if no match and report/add error!!
362
+ ## for now (unmatched) line gets auto-added as table line!!!
363
+ ##
364
+ ## note - MUST be followed by blank line (or nota bene/heading)
365
+ ## to switch back into top-level!!!!
366
+ m = TABLE_MORE_RE.match(line)
367
+ if m[:table_note]
368
+ tokens_by_line << [[:TABLE_NOTE, m[:table_note]]]
369
+ elsif m[:table_divider]
370
+ tokens_by_line << [[:TABLE_DIVIDER, m[:table_divider]]]
371
+ else ## assume table (line) e.g. m[:table]
372
+ tokens_by_line << [[:TABLE_LINE, line]]
211
373
  end
374
+ elsif @re != TABLE_MORE_RE && (m = HRULER_RE.match(line))
375
+ ## note - hruler (---)
376
+ ## will only match if NOT in table mode!!!
377
+ ## otherwise
378
+ ## hruler always resets parser mode to std/top-level!!!
379
+ @re = RE
380
+ tokens_by_line << [[:HRULER, '<|HRULER|>']]
381
+ elsif @teletype && (@re == RE && IS_TTY_LINE_RE.match(line))
382
+ ## try experimental TELETYPE (TTY) mode!!!
383
+ ## note - turn on via magic comment e.g. tty/teletype: true
384
+ ###
385
+ ### move inside _tokenize_line - why? why not?
386
+
387
+
388
+ tokens_by_line << _tokenize_tty_line( line )
389
+
390
+ ## note - dates such as
391
+ ## APR 11 or 11 APR will trigger TELETYPE
392
+ ### ## check letter
393
+ else
394
+
395
+ more_tokens, more_errors = _tokenize_line( line )
396
+
397
+ tokens_by_line << more_tokens
398
+ errors += more_errors
212
399
  end
400
+ end # each line
401
+
402
+
213
403
 
404
+
405
+
406
+ tokens_by_line = tokens_by_line.map do |tokens|
214
407
  #################
215
- ## pass 2
216
408
  ## transform tokens (using simple patterns)
217
409
  ## to help along the (racc look ahead 1 - LA1) parser
218
410
  nodes = []
@@ -220,48 +412,72 @@ def tokenize_with_errors
220
412
  buf = Tokens.new( tokens )
221
413
  ## pp buf
222
414
 
223
-
224
415
  loop do
225
416
  break if buf.eos?
226
417
 
227
- if buf.pos == 0 ## MUST start line
228
- ## check for
229
- ## group def or round def
230
- if buf.match?( :ROUND, :'|' ) ## assume round def (change round to round_def)
231
- nodes << [:ROUND_DEF, buf.next[1]]
232
- nodes << buf.next
233
- nodes += buf.collect
234
- break
235
- end
236
- if buf.match?( :GROUP, :'|' ) ## assume group def (change group to group_def)
237
- nodes << [:GROUP_DEF, buf.next[1]]
238
- nodes << buf.next
239
- ## change all text to team - why? why not?
240
- nodes += buf.collect { |t|
241
- t[0] == :TEXT ? [:TEAM, t[1]] : t
242
- }
243
- break
244
- end
245
- end
246
-
247
-
248
- if buf.match?( :TEXT, [:SCORE, :SCORE_MORE, :VS, :'-'], :TEXT )
249
- nodes << [:TEAM, buf.next[1]]
250
- nodes << buf.next
251
- nodes << [:TEAM, buf.next[1]]
252
- # note - now handled (upstream) with GOAL_RE mode!!!
253
- # elsif buf.match?( :TEXT, :MINUTE )
254
- # nodes << [:PLAYER, buf.next[1]]
255
- # nodes << buf.next
256
- elsif buf.match?( :DATE, :TIME ) ## merge DATE TIME into DATETIME
418
+ if buf.match?( :DATE, :TIME ) ## merge DATE TIME into DATETIME
257
419
  date = buf.next[1]
258
420
  time = buf.next[1]
259
421
  ## puts "DATETIME:"
260
422
  ## pp date, time
423
+ ## note: time value is { time: {} } or
424
+ ## { time: {}, time_local: {} }
261
425
  val = [date[0] + ' ' + time[0], ## concat string of two tokens
262
- { date: date[1], time: time[1] }
426
+ { date: date[1] }.merge( time[1] )
427
+ ]
428
+ nodes << [:DATETIME, val]
429
+ ### support date time with comma too - why? why not?
430
+ elsif buf.match?( :DATE, :',', :TIME )
431
+ date = buf.next[1]
432
+ _ = buf.next ## ignore comma
433
+ time = buf.next[1]
434
+ ## puts "DATETIME:"
435
+ ## pp date, time
436
+ val = [date[0] + ', ' + time[0], ## concat string of two tokens
437
+ { date: date[1] }.merge( time[1] )
438
+ ]
439
+ nodes << [:DATETIME, val]
440
+ elsif buf.match?( :TEAM, :SCORE_TEAM )
441
+ ## merge TEAM SCORE_TEAM into TEAMALT
442
+ ## (use TEAMENTRY or TEAMRESULT - why? why not?)
443
+ team = buf.next[1]
444
+ score_team = buf.next[1]
445
+ val = [team + ' ' + score_team[0], ## concat string of two tokens
446
+ { team: team }.merge( score_team[1] )
447
+ ]
448
+ nodes << [:TEAMALT, val]
449
+ elsif buf.match?( :TEAM, :SCORE_TEAM_PEN )
450
+ team = buf.next[1]
451
+ score_team_pen = buf.next[1]
452
+ val = [team + ' ' + score_team_pen[0], ## concat string of two tokens
453
+ { team: team }.merge( score_team_pen[1] )
454
+ ]
455
+ nodes << [:TEAMALT_PEN, val]
456
+ elsif buf.match?( :TEAM, :SCORE_TEAM_NUM )
457
+ team = buf.next[1]
458
+ score_team_num = buf.next[1]
459
+ val = [team + ' ' + score_team_num[0], ## concat string of two tokens
460
+ { team: team }.merge( score_team_num[1] )
263
461
  ]
264
- nodes << [:DATETIME, val]
462
+ nodes << [:TEAMALT_NUM, val]
463
+ elsif buf.match?( :GOAL_MINUTE, :',', :GOAL_MINUTE )
464
+ ## note - only advance by two tokens!
465
+ ## allows more :GOAL_MINUTE sequences!! e.g. 12,13,14 etc!!!
466
+ ##
467
+ ## help parser with comma shift/reduce conflict
468
+ ## change ',' to GOAL_MINUTE_SEP !!!
469
+ nodes << buf.next ## pass through goal_minute
470
+ _ = buf.next ## eat-up goal_minute_sep a.k.a. comma (,)
471
+ ## and replace with dedicated sep(arator)
472
+ nodes << [:GOAL_MINUTE_SEP,"<|GOAL_MINUTE_SEP|>"]
473
+ elsif buf.match?( :',', :INLINE_ATTENDANCE )
474
+ ## note - allow optional comma before inline attendance
475
+ ## help parser with comma shift/reduce conflict
476
+ ## change ',' to INLINE_ATTENDANCE_SEP !!!
477
+ nodes << [:INLINE_ATTENDANCE_SEP, "<|INLINE_ATTENDANCE_SEP|>"]
478
+ _ = buf.next ## eat-up inline_attendance_sep a.k.a. comma (,)
479
+ ## and replace with dedicated sep(arator)
480
+ nodes << buf.next ## pass through inline_attendance
265
481
  else
266
482
  ## pass through
267
483
  nodes << buf.next
@@ -271,6 +487,7 @@ def tokenize_with_errors
271
487
  end # map tokens_by_line
272
488
 
273
489
 
490
+
274
491
 
275
492
  ## flatten tokens
276
493
  tokens = []
@@ -280,9 +497,49 @@ def tokenize_with_errors
280
497
  pp tok
281
498
  end
282
499
 
500
+
501
+ ###############
502
+ ## "hacky" (automagic) line merges (remove newline)
503
+ ## if line start with @ - check if incl. teams
504
+
505
+ ###
506
+ ### quick merge lines hack
507
+ ## if line starts with geo-marker token @
508
+ ## check if line incl. TEAM
509
+ ## if yes, leave alone
510
+ ## otherwise merge line into previous line!!
511
+ ## - todo/fix - handle possibly in grammar!!!
512
+ ## for now match_line CAN start with @ London
513
+ ## resulting in parser conflict(s)!!!
514
+ ## e.g.
515
+ ## England v Scotland
516
+ ## @ London
517
+ ## =>
518
+ ## England v Scotland @ London
519
+ ##
520
+
521
+ ##
522
+ ## note/todo - if INDENT / SPACES get added
523
+ ## adjust here
524
+ ## tok[0][0] == :INDENT (or :SPACES) &&
525
+ ## tok[1][0] == :'@'
526
+
527
+ if tok[0] && tok[0][0] == :'@'
528
+ team = tok.find { |t| t[0] == :TEAM }
529
+ if team
530
+ ## do nothing - keep as is (assume match_line starting w/ @)
531
+ else
532
+ ## no team(s) found in line
533
+ ## remove last token (that is, NEWLINE)
534
+ ## note - possibly is blank ?! keep blank
535
+ tokens.pop if tokens[-1][0] == :NEWLINE
536
+ end
537
+ end
538
+
539
+
283
540
  tokens += tok
284
541
  ## auto-add newlines (unless BLANK!!)
285
- tokens << [:NEWLINE, "\n"] unless tok[0][0] == :BLANK
542
+ tokens << [:NEWLINE, "\n"] unless tok[0] && tok[0][0] == :BLANK
286
543
  end
287
544
 
288
545
  [tokens,errors]
@@ -290,42 +547,11 @@ end # method tokenize_with_errors
290
547
 
291
548
 
292
549
 
293
- ### add a QUICK_PLAYER_WITH_MINUTE check
294
- QUICK_PLAYER_WITH_MINUTE_RE = %r{
295
- ## note - \b NOT working for ? !!!
296
- ##
297
- ## use positive lookbehind
298
- (?<= [ ,;\(\)\[\]]|^)
299
-
300
- (?:
301
- (?:
302
- \d{1,3} ## constrain numbers to 0 to 999!!!
303
- (?: \+\d{1,3}
304
- )?
305
- )
306
- |
307
- (?: \?{2} | _{2} ) ## add support for n/a (not/available)
308
- )
309
- ' ## must have minute marker!!!!
310
- }ix
311
-
312
550
 
313
551
  def _tokenize_line( line )
314
552
  tokens = []
315
553
  errors = [] ## keep a list of errors - why? why not?
316
554
 
317
- puts "line: >#{line}<" if debug?
318
-
319
-
320
- ### special case for empty line (aka BLANK)
321
- if line.empty?
322
- ## note - blank always resets parser mode to std/top-level!!!
323
- @re = RE
324
-
325
- tokens << [:BLANK, '<|BLANK|>']
326
- return [tokens, errors]
327
- end
328
-
329
555
 
330
556
  pos = 0
331
557
  ## track last offsets - to report error on no match
@@ -333,6 +559,9 @@ def _tokenize_line( line )
333
559
  offsets = [0,0]
334
560
  m = nil
335
561
 
562
+ ## track number of geo text seen
563
+ ## (use for - do NOT break on two spaces if no geo text seen yet!!)
564
+ geo_count = 0
336
565
 
337
566
  ####
338
567
  ## quick hack - keep re state/mode between tokenize calls!!!
@@ -342,34 +571,76 @@ def _tokenize_line( line )
342
571
  if @re == RE ## top-level
343
572
  ### check for modes once (per line) here to speed-up parsing
344
573
  ### for now goals only possible for start of line!!
345
- ### fix - remove optional [] - why? why not?
346
-
347
- ## start with prop key (match will switch into prop mode!!!)
348
- ## - fix - remove leading spaces in regex (upstream) - why? why not?
349
- if (m = PROP_KEY_RE.match( line ))
574
+ ### fix - remove optional [] - why? why not?
575
+
576
+ ####
577
+ ## note - ord e.g. (45) for match number can only start a (match) line
578
+ ## "inline" use NOT possible
579
+ ## note - ord (for ordinal number!!!) e.g match number (1), (42), etc.
580
+ if (m = START_WITH_ORD.match(line))
581
+ ## note - strip enclosing () and convert to integer
582
+ tokens << [:ORD, [m[:ord], { value: m[:value].to_i(10) } ]]
583
+
584
+ offsets = [m.begin(0), m.end(0)]
585
+ pos = offsets[1] ## update pos
586
+ elsif (m = START_WITH_YEAR.match(line))
587
+ ## note - strip enclosing () and convert to integer
588
+ tokens << [:YEAR, m[:year].to_i(10)]
589
+
590
+ offsets = [m.begin(0), m.end(0)]
591
+ pos = offsets[1] ## update pos
592
+
593
+ ###
594
+ ## todo/fix - rename to START_GROUP_DEF_LINE_RE !!!!
595
+ elsif (m = GROUP_DEF_LINE_RE.match( line ))
596
+ puts " ENTER GROUP_DEF_RE MODE" if debug?
597
+ @re = GROUP_DEF_RE
598
+
599
+ tokens << [:GROUP_DEF, m[:group_def]]
600
+
601
+ offsets = [m.begin(0), m.end(0)]
602
+ pos = offsets[1] ## update pos
603
+
604
+ ### todo/fix - rename to PROP_KEY_RE to START_WITH_PROP_KEY_RE !!!
605
+ elsif (m = PROP_KEY_RE.match( line ))
606
+ ## start with prop key (match will switch into prop mode!!!)
607
+ ## - fix - remove leading spaces in regex (upstream) - why? why not?
608
+ ##
350
609
  ### switch into new mode
351
610
  ## switch context to PROP_RE
352
611
  puts " ENTER PROP_RE MODE" if debug?
353
612
  key = m[:key]
354
613
 
355
614
 
356
- ### todo - add prop yellow/red cards too - why? why not?
357
- if ['sent off', 'red cards'].include?( key.downcase)
615
+ ### todo/fix - add prop yellow/red cards too - why? why not?
616
+ ## todo/fix - separate sent off and red card
617
+ ## sent-off - incl. red card, yellow/red card and the era before red cards!!
618
+ if ['sent off'].include?( key.downcase)
619
+ @re = PROP_CARDS_RE ## use CARDS_RE ???
620
+ tokens << [:PROP_SENTOFF, m[:key]]
621
+ elsif ['red cards'].include?( key.downcase )
358
622
  @re = PROP_CARDS_RE ## use CARDS_RE ???
359
623
  tokens << [:PROP_REDCARDS, m[:key]]
360
624
  elsif ['yellow cards'].include?( key.downcase )
361
625
  @re = PROP_CARDS_RE
362
626
  tokens << [:PROP_YELLOWCARDS, m[:key]]
363
- elsif ['ref', 'referee'].include?( key.downcase )
627
+ elsif ['ref', 'referee',
628
+ 'refs', 'referees' ## note - allow/support assistant refs
629
+ ].include?( key.downcase )
364
630
  @re = PROP_REFEREE_RE
365
631
  tokens << [:PROP_REFEREE, m[:key]]
366
632
  elsif ['att', 'attn', 'attendance'].include?( key.downcase )
367
633
  @re = PROP_ATTENDANCE_RE
368
634
  tokens << [:PROP_ATTENDANCE, m[:key]]
369
- elsif ['goals'].include?( key.downcase )
370
- @re = PROP_GOAL_RE
371
- tokens << [:PROP_GOALS, m[:key]]
372
- elsif ['penalties', 'penalty shootout'].include?( key.downcase )
635
+
636
+ # elsif ['goals'].include?( key.downcase )
637
+ # @re = PROP_GOAL_RE
638
+ # tokens << [:PROP_GOALS, m[:key]]
639
+
640
+ elsif ['penalties',
641
+ 'penalty shootout',
642
+ 'penalty shoot-out',
643
+ 'penalty kicks'].include?( key.downcase )
373
644
  @re = PROP_PENALTIES_RE
374
645
  tokens << [:PROP_PENALTIES, m[:key]]
375
646
  else ## assume (team) line-up
@@ -379,63 +650,69 @@ def _tokenize_line( line )
379
650
 
380
651
  offsets = [m.begin(0), m.end(0)]
381
652
  pos = offsets[1] ## update pos
653
+ ###
654
+ ### todo/fix
655
+ ### rename to START_WITH_ROUND_DEF_OUTLINE_RE !!!!
656
+ elsif (m = ROUND_DEF_OUTLINE_RE.match( line ))
657
+ puts " ENTER ROUND_DEF_RE MODE" if debug?
658
+ @re = ROUND_DEF_RE
659
+
660
+ ## note - return ROUND_DEF NOT ROUND_OUTLINE token
661
+ tokens << [:ROUND_DEF, m[:round_outline]]
662
+
663
+ offsets = [m.begin(0), m.end(0)]
664
+ pos = offsets[1] ## update pos
382
665
  elsif (m = ROUND_OUTLINE_RE.match( line ))
383
666
  puts " ROUND_OUTLINE" if debug?
667
+ ## note - derive round level from no of (leading) markers
668
+ ## e.g. ▪/:: is 1, ▪▪/::: is 2, ▪▪▪/:::: is 3, etc.
669
+ ## note - ascii-style starts with double ::, thus, autodecrement by one!
670
+ round_level = m[:round_marker].size
671
+ round_level -= 1 if m[:round_marker].start_with?( '::' )
384
672
 
385
- tokens << [:ROUND_OUTLINE, m[:round_outline]]
673
+ tokens << [:ROUND_OUTLINE, [m[:round_outline],
674
+ { outline: m[:round_outline] ,
675
+ level: round_level}]]
386
676
 
387
677
  ## note - eats-up line for now (change later to only eat-up marker e.g. »|>>)
388
678
  offsets = [m.begin(0), m.end(0)]
389
679
  pos = offsets[1] ## update pos
390
- elsif (m = PLAYER_WITH_SCORE_RE.match( line ))
391
- ## switch context to GOAL_RE (goalline(s)
392
- ## split token (automagically) into two!! - player AND minute!!!
393
- @re = GOAL_RE
394
- puts " ENTER GOAL_RE MODE" if debug?
395
-
396
- score = {}
397
- ## must always have ft for now e.g. 1-1 or such
398
- ### change to (generic) score from ft -
399
- ## might be score a.e.t. or such - why? why not?
400
- score[:ft] = [m[:ft1].to_i(10),
401
- m[:ft2].to_i(10)]
402
- ## note - for debugging keep (pass along) "literal" score
403
- tokens << [:SCORE, [m[:score], score]]
404
-
405
- ## auto-add player token
406
- tokens << [:PLAYER, m[:name]]
407
-
408
- offsets = [m.begin(0), m.end(0)]
409
- pos = offsets[1] ## update pos
680
+ elsif (m = START_GOAL_LINE_RE.match( line )) ## line starting with ( - assume
681
+ ## switch context to GOAL_RE (goalline(s))
682
+ ####
683
+ ## note - check for alternate goal line styles / formats
684
+ if START_GOAL_LINE_COMPAT_RE.match(line )
685
+ ## "legacy" style starting with minute e.g.
686
+ ## (6 Puskás 0-1, 9 Czibor 0-2, 11 Morlock 1-2, 18 Rahn 2-2,
687
+ ## 84 Rahn 3-2)
688
+ @re = GOAL_COMPAT_RE
689
+ puts " ENTER GOAL_COMPAT_RE MODE" if debug?
690
+
691
+ tokens << [:GOALS_COMPAT, "<|GOALS_COMPAT|>"]
692
+ elsif START_GOAL_LINE_ALT_RE.match( line )
693
+ ## goals with scores e.g.
694
+ ## (1-0 Franck Ribéry, 2-0 Ivica Olić, 2-1 Wayne Rooney)
695
+ ## -or-
696
+ ## (Dion Beljo 1-0
697
+ ## 1-1 Andreas Gruber
698
+ ## Matthias Seidl 2-1)
699
+ @re = GOAL_ALT_RE
700
+ puts " ENTER GOAL_ALT_RE MODE" if debug?
701
+
702
+ tokens << [:GOALS_ALT, "<|GOALS_ALT|>"]
703
+ else
704
+ ## "standard" / default style
705
+ @re = GOAL_RE
706
+ puts " ENTER GOAL_RE MODE" if debug?
410
707
 
411
- #### FIX/FIX/TODO
412
- ### looks to hang in player with minute
413
- ### FIX - improve / rework PLAYER_WITH_MINUTE_RE regex!!!!
414
- elsif (_quick = QUICK_PLAYER_WITH_MINUTE_RE.match(line) &&
415
- m = PLAYER_WITH_MINUTE_RE.match( line ))
416
- ## switch context to GOAL_RE (goalline(s)
417
- ## split token (automagically) into two!! - player AND minute!!!
418
- @re = GOAL_RE
419
- puts " ENTER GOAL_RE MODE" if debug?
420
-
421
- ## check for optional open_bracket
422
- tokens << [:'['] if m[:open_bracket]
423
-
424
- ## check for -; (none with separator)
425
- ## todo - find a better way? how possible?
426
- tokens << [:NONE, "<|NONE|>"] if m[:none]
427
-
428
- ## auto-add player token first
429
- tokens << [:PLAYER, m[:name]]
430
- ## minute props
431
- minute = {}
432
- minute[:m] = m[:value].to_i(10)
433
- minute[:offset] = m[:value2].to_i(10) if m[:value2]
434
- ## t is minute only
435
- tokens << [:MINUTE, [m[:minute], minute]]
708
+ tokens << [:GOALS, "<|GOALS|>"]
709
+ end
436
710
 
711
+ ## note - eat-up ( for now
712
+ ## pass along "virtual" GOALS or GOALS_ALT token
713
+ ## (see INLINE_GOALS for the starting goal line inline)
437
714
  offsets = [m.begin(0), m.end(0)]
438
- pos = offsets[1] ## update pos
715
+ pos = offsets[1] ## update pos
439
716
  end
440
717
  end
441
718
 
@@ -475,24 +752,105 @@ def _tokenize_line( line )
475
752
  ## note: racc requires pairs e.g. [:TOKEN, VAL]
476
753
  ## for VAL use "text" or ["text", { opts }] array
477
754
 
478
- t = if @re == GEO_RE
479
- ### note - possibly end inline geo on [ (and others?? in the future
480
- if m[:space] || m[:spaces]
481
- nil ## skip space(s)
482
- elsif m[:text]
483
- [:GEO, m[:text]] ## keep pos - why? why not?
484
- elsif m[:timezone]
485
- [:TIMEZONE, m[:timezone]]
486
- elsif m[:sym]
487
- sym = m[:sym]
488
- ## return symbols "inline" as is - why? why not?
489
- ## (?<sym>[;,@|\[\]-])
490
-
491
- case sym
492
- when ',' then [:',']
493
- when '›' then [:','] ## note - treat geo sep › (unicode) like comma for now!!!
494
- when '>' then [:','] ## note - treat geo sep > (ascii) like comma for now!!!
495
- when '[' then
755
+
756
+ t = if @re == ROUND_DEF_RE
757
+ if m[:spaces] || m[:space]
758
+ nil ## skip spaces
759
+ elsif m[:date]
760
+ [:DATE, [m[:date], _build_date( m )]]
761
+ elsif m[:duration]
762
+ [:DURATION, [m[:duration], _build_duration( m )]]
763
+ elsif m[:sym]
764
+ sym = m[:sym]
765
+ case sym
766
+ when '|' then [:'|']
767
+ when ':' then [:':']
768
+ when ',' then [:',']
769
+ else
770
+ puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
771
+ nil ## ignore others (e.g. brackets [])
772
+ end
773
+ elsif m[:any]
774
+ ## todo/check log error
775
+ msg = "parse error (tokenize round_def) - skipping any match>#{m[:any]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
776
+ puts "!! WARN - #{msg}"
777
+
778
+ errors << msg
779
+ log( "!! WARN - #{msg}" )
780
+
781
+ nil
782
+ else
783
+ ## report error/raise exception
784
+ puts "!!! TOKENIZE ERROR - no match found"
785
+ nil
786
+ end
787
+ elsif @re == GROUP_DEF_RE
788
+ if m[:spaces] || m[:space]
789
+ nil ## skip spaces
790
+ elsif m[:text]
791
+ [:TEAM, m[:text]]
792
+ elsif m[:sym]
793
+ sym = m[:sym]
794
+ case sym
795
+ when '|' then [:'|']
796
+ when ':' then [:':']
797
+ when ',' then [:',']
798
+ else
799
+ puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
800
+ nil ## ignore others (e.g. brackets [])
801
+ end
802
+ elsif m[:any]
803
+ ## todo/check log error
804
+ msg = "parse error (tokenize group_def) - skipping any match>#{m[:any]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
805
+ puts "!! WARN - #{msg}"
806
+
807
+ errors << msg
808
+ log( "!! WARN - #{msg}" )
809
+
810
+ nil
811
+ else
812
+ ## report error/raise exception
813
+ puts "!!! TOKENIZE ERROR - no match found"
814
+ nil
815
+ end
816
+ elsif @re == GEO_RE
817
+ ### note - possibly end inline geo on [ (and others?? in the future
818
+ ## note: break on double spaces e.g.
819
+ ## e.g. Jul/16 @ Arena Auf Schalke, Gelsenkirchen Serbia 0-1 England
820
+ if m[:spaces]
821
+ ### note - do NOT break out
822
+ ## if no text seen yet!!!
823
+ if geo_count > 0
824
+ ## get out of geo mode and backtrack (w/ next)
825
+ puts " LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
826
+ @re = RE
827
+ pos = old_pos
828
+ next ## backtrack (resume new loop step)
829
+ else
830
+ nil ## skip spaces
831
+ end
832
+ elsif m[:space]
833
+ nil ## skip (single) space
834
+ elsif m[:text]
835
+ geo_count += 1
836
+ [:GEO, m[:text]] ## keep pos - why? why not?
837
+ elsif m[:geo_end] ## "hacky" special comma; always ends geo mode!!!
838
+ ## get out of geo mode and backtrack (w/ next)
839
+ puts " LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
840
+ @re = RE
841
+ pos = old_pos
842
+ next ## backtrack (resume new loop step)
843
+ elsif m[:sym]
844
+ sym = m[:sym]
845
+ ## return symbols "inline" as is - why? why not?
846
+ ## (?<sym>[;,@|\[\]-])
847
+ case sym
848
+ ## note - reset geo_count to 0 (avoids break on two spaces)
849
+ ## if separator seen!!
850
+ when ',' then geo_count = 0; [:',']
851
+ when '›' then geo_count = 0; [:','] ## note - treat geo sep › (unicode) like comma for now!!!
852
+ when '>' then geo_count = 0; [:','] ## note - treat geo sep > (ascii) like comma for now!!!
853
+ when '[' then
496
854
  ## get out-off geo mode and backtrack (w/ next)
497
855
  puts " LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
498
856
  @re = RE
@@ -554,19 +912,29 @@ def _tokenize_line( line )
554
912
  ## report error - for unknown (inline) prop key in lineup
555
913
  nil
556
914
  end
915
+ elsif m[:inline_captain]
916
+ [:INLINE_CAPTAIN, m[:inline_captain]]
917
+ elsif m[:inline_yellow]
918
+ card = {}
919
+ card[:m] = m[:minute].to_i(10) if m[:minute]
920
+ card[:offset] = m[:offset].to_i(10) if m[:offset]
921
+ [:INLINE_YELLOW, [m[:inline_yellow], card]]
922
+ elsif m[:inline_red]
923
+ card = {}
924
+ card[:m] = m[:minute].to_i(10) if m[:minute]
925
+ card[:offset] = m[:offset].to_i(10) if m[:offset]
926
+ [:INLINE_RED, [m[:inline_red], card]]
927
+ elsif m[:inline_yellow_red]
928
+ card = {}
929
+ card[:m] = m[:minute].to_i(10) if m[:minute]
930
+ card[:offset] = m[:offset].to_i(10) if m[:offset]
931
+ [:INLINE_YELLOW_RED, [m[:inline_yellow_red], card]]
557
932
  elsif m[:prop_name]
558
- if m[:name] == 'Y'
559
- [:YELLOW_CARD, m[:name]]
560
- elsif m[:name] == 'R'
561
- [:RED_CARD, m[:name]]
562
- else
563
- [:PROP_NAME, m[:name]]
564
- end
933
+ [:PROP_NAME, m[:name]]
565
934
  elsif m[:minute]
566
935
  minute = {}
567
936
  minute[:m] = m[:value].to_i(10)
568
937
  minute[:offset] = m[:value2].to_i(10) if m[:value2]
569
- ## note - for debugging keep (pass along) "literal" minute
570
938
  [:MINUTE, [m[:minute], minute]]
571
939
  elsif m[:sym]
572
940
  sym = m[:sym]
@@ -661,9 +1029,8 @@ def _tokenize_line( line )
661
1029
  ## must always have ft for now e.g. 1-1 or such
662
1030
  ### change to (generic) score from ft -
663
1031
  ## might be score a.e.t. or such - why? why not?
664
- score[:ft] = [m[:ft1].to_i(10),
665
- m[:ft2].to_i(10)]
666
- ## note - for debugging keep (pass along) "literal" score
1032
+ score[:score] = [m[:score1].to_i(10),
1033
+ m[:score2].to_i(10)]
667
1034
  [:SCORE, [m[:score], score]]
668
1035
  elsif m[:sym]
669
1036
  sym = m[:sym]
@@ -680,30 +1047,107 @@ def _tokenize_line( line )
680
1047
  puts "!!! TOKENIZE ERROR (PROP_PENALTIES_RE) - no match found"
681
1048
  nil
682
1049
  end
683
- elsif @re == GOAL_RE || @re == PROP_GOAL_RE
1050
+ elsif @re == GOAL_COMPAT_RE
684
1051
  if m[:space] || m[:spaces]
685
1052
  nil ## skip space(s)
686
1053
  elsif m[:prop_name] ## note - change prop_name to player
687
1054
  [:PLAYER, m[:name]]
688
1055
  elsif m[:minute]
689
- minute = {}
690
- minute[:m] = m[:value].to_i(10)
691
- minute[:offset] = m[:value2].to_i(10) if m[:value2]
692
- ## note - for debugging keep (pass along) "literal" minute
1056
+ minute = _build_minute( m )
693
1057
  [:MINUTE, [m[:minute], minute]]
1058
+ elsif m[:goal_type]
1059
+ goal_type = _build_goal_type( m )
1060
+ [:GOAL_TYPE, [m[:goal_type], goal_type]]
694
1061
  elsif m[:score]
695
- score = {}
696
- ## must always have ft for now e.g. 1-1 or such
697
- ### change to (generic) score from ft -
698
- ## might be score a.e.t. or such - why? why not?
699
- score[:ft] = [m[:ft1].to_i(10),
700
- m[:ft2].to_i(10)]
701
- ## note - for debugging keep (pass along) "literal" score
702
- [:SCORE, [m[:score], score]]
703
- elsif m[:og]
704
- [:OG, m[:og]] ## for typed drop - string version/variants ?? why? why not?
705
- elsif m[:pen]
706
- [:PEN, m[:pen]]
1062
+ score = {}
1063
+ ## note - score is "generic"
1064
+ ## might be full-time (ft) or
1065
+ ## after extra-time (aet) or such
1066
+ ## or even undecided/unknown
1067
+ ## thus, use score1/score2 and NOT ft1/ft2
1068
+ score[:score] = [m[:score1].to_i(10),
1069
+ m[:score2].to_i(10)]
1070
+ ## note - for debugging keep (pass along) "literal" score
1071
+ [:SCORE, [m[:score], score]]
1072
+ elsif m[:sym]
1073
+ sym = m[:sym]
1074
+ ## return symbols "inline" as is - why? why not?
1075
+ ## (?<sym>[;,@|\[\]-])
1076
+
1077
+ case sym
1078
+ when ',' then [:',']
1079
+ when ')' ## leave goal mode!!
1080
+ puts " LEAVE GOAL_COMPAT_RE MODE" if debug?
1081
+ @re = RE
1082
+ ## note - use/return GOAL_END token - change to GOAL_END_PAREN(THESIS)
1083
+ ## or GOAL_PAREN_CLOSE/END ???
1084
+ [:GOALS_END, '<|GOALS_END|>']
1085
+ else
1086
+ nil ## ignore others (e.g. brackets [])
1087
+ end
1088
+ else
1089
+ ## report error
1090
+ puts "!!! TOKENIZE ERROR (GOAL_COMPAT_RE) - no match found"
1091
+ nil
1092
+ end
1093
+ elsif @re == GOAL_ALT_RE
1094
+ if m[:space] || m[:spaces]
1095
+ nil ## skip space(s)
1096
+ elsif m[:prop_name] ## note - change prop_name to player
1097
+ [:PLAYER, m[:name]]
1098
+ elsif m[:goal_minute]
1099
+ minute = _build_goal_minute( m )
1100
+ [:GOAL_MINUTE, [m[:goal_minute], minute]]
1101
+ elsif m[:goal_type]
1102
+ goal_type = _build_goal_type( m )
1103
+ [:GOAL_TYPE, [m[:goal_type], goal_type]]
1104
+ elsif m[:score]
1105
+ score = {}
1106
+ ## note - score is "generic"
1107
+ ## might be full-time (ft) or
1108
+ ## after extra-time (aet) or such
1109
+ ## or even undecided/unknown
1110
+ ## thus, use score1/score2 and NOT ft1/ft2
1111
+ score[:score] = [m[:score1].to_i(10),
1112
+ m[:score2].to_i(10)]
1113
+ ## note - for debugging keep (pass along) "literal" score
1114
+ [:SCORE, [m[:score], score]]
1115
+ elsif m[:sym]
1116
+ sym = m[:sym]
1117
+ ## return symbols "inline" as is - why? why not?
1118
+ ## (?<sym>[;,@|\[\]-])
1119
+
1120
+ case sym
1121
+ when ',' then [:',']
1122
+ when ')' ## leave goal mode!!
1123
+ puts " LEAVE GOAL_ALT_RE MODE" if debug?
1124
+ @re = RE
1125
+ ## note - use/return GOAL_END token - change to GOAL_END_PAREN(THESIS)
1126
+ ## or GOAL_PAREN_CLOSE/END ???
1127
+ [:GOALS_END, '<|GOALS_END|>']
1128
+ else
1129
+ nil ## ignore others (e.g. brackets [])
1130
+ end
1131
+ else
1132
+ ## report error
1133
+ puts "!!! TOKENIZE ERROR (GOAL_ALT_RE) - no match found"
1134
+ nil
1135
+ end
1136
+ elsif @re == GOAL_RE
1137
+ if m[:space] || m[:spaces]
1138
+ nil ## skip space(s)
1139
+ elsif m[:goals_none] ## note - eats-up semicolon!! e.g. -; or - ;
1140
+ [:GOALS_NONE, "<|GOALS_NONE|>"]
1141
+ elsif m[:goal_sep_alt]
1142
+ [:GOAL_SEP_ALT, "<|GOAL_SEP_ALT|>" ] ## e.g. dash (-) WITH leading & trailing space required
1143
+ elsif m[:prop_name] ## note - change prop_name to player
1144
+ [:PLAYER, m[:name]]
1145
+ elsif m[:goal_minute]
1146
+ minute = _build_goal_minute( m )
1147
+ [:GOAL_MINUTE, [m[:goal_minute], minute]]
1148
+ elsif m[:goal_count]
1149
+ count = _build_goal_count( m )
1150
+ [:GOAL_COUNT, [m[:goal_count], count]]
707
1151
  elsif m[:sym]
708
1152
  sym = m[:sym]
709
1153
  ## return symbols "inline" as is - why? why not?
@@ -712,8 +1156,14 @@ def _tokenize_line( line )
712
1156
  case sym
713
1157
  when ',' then [:',']
714
1158
  when ';' then [:';']
715
- when '[' then [:'[']
716
- when ']' then [:']']
1159
+ # when '[' then [:'[']
1160
+ # when ']' then [:']']
1161
+ when ')' ## leave goal mode!!
1162
+ puts " LEAVE GOAL_RE MODE" if debug?
1163
+ @re = RE
1164
+ ## note - use/return GOAL_END token - change to GOAL_END_PAREN(THESIS)
1165
+ ## or GOAL_PAREN_CLOSE/END ???
1166
+ [:GOALS_END, '<|GOALS_END|>']
717
1167
  else
718
1168
  nil ## ignore others (e.g. brackets [])
719
1169
  end
@@ -728,74 +1178,112 @@ def _tokenize_line( line )
728
1178
  if m[:space] || m[:spaces]
729
1179
  nil ## skip space(s)
730
1180
  elsif m[:text]
731
- [:TEXT, m[:text]] ## keep pos - why? why not?
1181
+ ## note - top-level (for now always) assumes TEAM for TEXT match!!
1182
+ [:TEAM, m[:text]] ## keep pos - why? why not?
732
1183
  elsif m[:status] ## (match) status e.g. cancelled, awarded, etc.
733
- ## todo/check - add text (or status)
734
- # to opts hash {} by default (for value)
735
- if m[:status_note] ## includes note? e.g. awarded; originally 2-0
736
- [:STATUS, [m[:status], {status: m[:status],
737
- note: m[:status_note]} ]]
738
- else
739
- [:STATUS, [m[:status], {status: m[:status] } ]]
740
- end
1184
+ [:STATUS, [m[:status], _build_status( m ) ]]
1185
+ elsif m[:inline_wo] ## w/o - walkover (match status)
1186
+ [:INLINE_WO, m[:inline_wo]]
1187
+ elsif m[:inline_np] ## n/p - not played (match status)
1188
+ [:INLINE_NP, m[:inline_np]]
1189
+ elsif m[:inline_bye] ## bye (match status)
1190
+ [:INLINE_BYE, m[:inline_bye]]
1191
+ elsif m[:inline_abd] ## abd/abd. - abandoned (match status)
1192
+ [:INLINE_ABD, m[:inline_abd]]
1193
+ elsif m[:inline_void] ## void - match declared void (match status)
1194
+ [:INLINE_VOID, m[:inline_void]]
1195
+ elsif m[:inline_susp] ## susp/susp. - suspended (match status)
1196
+ [:INLINE_SUSP, m[:inline_susp]]
1197
+ elsif m[:inline_ppd] ## ppd/ppd. or postp/postp. - postponed (match status)
1198
+ [:INLINE_PPD, m[:inline_ppd]]
1199
+ elsif m[:inline_awd] ## awd/awd. - awarded (match status)
1200
+ [:INLINE_AWD, m[:inline_awd]]
1201
+ elsif m[:inline_canc] ## canc/canc. - cancelled/canceled (match status)
1202
+ [:INLINE_CANC, m[:inline_canc]]
1203
+
1204
+ elsif m[:team_home]
1205
+ [:TEAM_HOME, m[:team_home]]
1206
+ elsif m[:team_away]
1207
+ [:TEAM_AWAY, m[:team_away]]
1208
+ elsif m[:team_neutral]
1209
+ [:TEAM_NEUTRAL, m[:team_neutral]]
1210
+
1211
+ elsif m[:attendance]
1212
+ att = {}
1213
+ att[:value] = m[:value].gsub( '_', '' ).to_i(10)
1214
+ ## note - for token id use INLINE_ATTENDANCE (ATTENDANCE in use for prop!!!)
1215
+ [:INLINE_ATTENDANCE, [m[:attendance], att ]]
741
1216
  elsif m[:note]
742
1217
  ### todo/check:
743
1218
  ## use value hash - why? why not? or simplify to:
744
1219
  ## [:NOTE, [m[:note], {note: m[:note] } ]]
745
1220
  [:NOTE, m[:note]]
746
- elsif m[:score_note]
747
- [:SCORE_NOTE, m[:score_note]]
748
1221
  elsif m[:time]
749
- ## unify to iso-format
750
- ### 12.40 => 12:40
751
- ## 12h40 => 12:40 etc.
752
- ## keep string (no time-only type in ruby)
753
- hour = m[:hour].to_i(10) ## allow 08/07/etc.
754
- minute = m[:minute].to_i(10)
755
- ## check if valid - 0:00 - 24:00
756
- ## check if 24:00 possible? or only 0:00 (23:59)
757
- if (hour >= 0 && hour <= 24) &&
758
- (minute >=0 && minute <= 59)
759
- ## note - for debugging keep (pass along) "literal" time
760
- ## might use/add support for am/pm later
761
- [:TIME, [m[:time], {h:hour,m:minute}]]
762
- else
763
- raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
764
- end
1222
+ [:TIME, [m[:time], _build_time(m)]]
765
1223
  elsif m[:date]
766
- date = {}
767
- ## map month names
768
- ## note - allow any/upcase JULY/JUL etc. thus ALWAYS downcase for lookup
769
- date[:y] = m[:year].to_i(10) if m[:year]
770
- ## check - use y too for two-digit year or keep separate - why? why not?
771
- date[:yy] = m[:yy].to_i(10) if m[:yy] ## two digit year (e.g. 25 or 78 etc.)
772
- date[:m] = m[:month].to_i(10) if m[:month]
773
- date[:m] = MONTH_MAP[ m[:month_name].downcase ] if m[:month_name]
774
- date[:d] = m[:day].to_i(10) if m[:day]
775
- date[:wday] = DAY_MAP[ m[:day_name].downcase ] if m[:day_name]
776
- ## note - for debugging keep (pass along) "literal" date
777
- [:DATE, [m[:date], date]]
778
- elsif m[:duration]
779
- ## todo/check/fix - if end: works for kwargs!!!!!
780
- duration = { start: {}, end: {}}
781
- duration[:start][:y] = m[:year1].to_i(10) if m[:year1]
782
- duration[:start][:m] = MONTH_MAP[ m[:month_name1].downcase ] if m[:month_name1]
783
- duration[:start][:d] = m[:day1].to_i(10) if m[:day1]
784
- duration[:start][:wday] = DAY_MAP[ m[:day_name1].downcase ] if m[:day_name1]
785
- duration[:end][:y] = m[:year2].to_i(10) if m[:year2]
786
- duration[:end][:m] = MONTH_MAP[ m[:month_name2].downcase ] if m[:month_name2]
787
- duration[:end][:d] = m[:day2].to_i(10) if m[:day2]
788
- duration[:end][:wday] = DAY_MAP[ m[:day_name2].downcase ] if m[:day_name2]
789
- ## note - for debugging keep (pass along) "literal" duration
790
- [:DURATION, [m[:duration], duration]]
791
- elsif m[:wday] ## standalone weekday e.g. Mo/Tu/We/etc.
792
- [:WDAY, [m[:wday], { wday: DAY_MAP[ m[:day_name].downcase ] } ]]
793
- elsif m[:num] ## fix - change to ord (for ordinal number!!!)
794
- ## note - strip enclosing () and convert to integer
795
- [:ORD, [m[:num], { value: m[:value].to_i(10) } ]]
796
- elsif m[:score_more]
1224
+ [:DATE, [m[:date], _build_date(m)]]
1225
+ elsif m[:date_legs]
1226
+ [:DATE_LEGS, [m[:date_legs], _build_date_legs(m)]]
1227
+ elsif m[:score_team]
1228
+ [:SCORE_TEAM, [m[:score_team], _build_score_team(m)]]
1229
+ elsif m[:score_team_pen]
1230
+ [:SCORE_TEAM_PEN, [m[:score_team_pen], _build_score_team_pen(m)]]
1231
+ elsif m[:score_team_num]
1232
+ [:SCORE_TEAM_NUM, [m[:score_team_num], _build_score_team_num(m)]]
1233
+ elsif m[:score_legs]
1234
+ legs = {}
1235
+
1236
+ ### leg1
1237
+ score = {}
1238
+ score[:ft] = [m[:leg1_ft1].to_i(10),
1239
+ m[:leg1_ft2].to_i(10)]
1240
+ legs['leg1'] = score
1241
+
1242
+ ### leg2
1243
+ score = {}
1244
+ score[:ft] = [m[:leg2_ft1].to_i(10),
1245
+ m[:leg2_ft2].to_i(10)] if m[:leg2_ft1] && m[:leg2_ft2]
1246
+ score[:et] = [m[:leg2_et1].to_i(10),
1247
+ m[:leg2_et2].to_i(10)] if m[:leg2_et1] && m[:leg2_et2]
1248
+ score[:p] = [m[:leg2_p1].to_i(10),
1249
+ m[:leg2_p2].to_i(10)] if m[:leg2_p1] && m[:leg2_p2]
1250
+ legs['leg2'] = score
1251
+
1252
+ ## check for (opt) aggregate - keep on "top-level"
1253
+ legs[:agg] = [m[:agg1].to_i(10),
1254
+ m[:agg2].to_i(10)] if m[:agg1] && m[:agg2]
1255
+ legs[:away] = true if m[:away]
1256
+
1257
+ ## note - for debugging keep (pass along) "literal" score
1258
+ [:SCORE_LEGS, [m[:score_legs], legs]]
1259
+ elsif m[:score_full]
1260
+ score = {}
1261
+ score[:p] = [m[:p1].to_i(10),
1262
+ m[:p2].to_i(10)] if m[:p1] && m[:p2]
1263
+ score[:et] = [m[:et1].to_i(10),
1264
+ m[:et2].to_i(10)] if m[:et1] && m[:et2]
1265
+ score[:ft] = [m[:ft1].to_i(10),
1266
+ m[:ft2].to_i(10)] if m[:ft1] && m[:ft2]
1267
+ score[:ht] = [m[:ht1].to_i(10),
1268
+ m[:ht2].to_i(10)] if m[:ht1] && m[:ht2]
1269
+ score[:agg] = [m[:agg1].to_i(10),
1270
+ m[:agg2].to_i(10)] if m[:agg1] && m[:agg2]
1271
+
1272
+ if m[:away1] && m[:away2]
1273
+ score[:away] = [m[:away1].to_i(10),
1274
+ m[:away2].to_i(10)]
1275
+ elsif m[:away] ## fallback if no away score; check away flag
1276
+ score[:away] = true
1277
+ end
1278
+
1279
+ ## add golden/silver flags
1280
+ score[:golden] = true if m[:aetgg] ## golden goal (gg)/sudden death (sd)
1281
+ score[:silver] = true if m[:aetsg] ## silver goal (sg)
1282
+
1283
+ ## note - for debugging keep (pass along) "literal" score
1284
+ [:SCORE_FULL, [m[:score_full], score]]
1285
+ elsif m[:score_fuller]
797
1286
  score = {}
798
- ## check for pen
799
1287
  score[:p] = [m[:p1].to_i(10),
800
1288
  m[:p2].to_i(10)] if m[:p1] && m[:p2]
801
1289
  score[:et] = [m[:et1].to_i(10),
@@ -804,18 +1292,85 @@ def _tokenize_line( line )
804
1292
  m[:ft2].to_i(10)] if m[:ft1] && m[:ft2]
805
1293
  score[:ht] = [m[:ht1].to_i(10),
806
1294
  m[:ht2].to_i(10)] if m[:ht1] && m[:ht2]
1295
+ score[:agg] = [m[:agg1].to_i(10),
1296
+ m[:agg2].to_i(10)] if m[:agg1] && m[:agg2]
1297
+ if m[:away1] && m[:away2]
1298
+ score[:away] = [m[:away1].to_i(10),
1299
+ m[:away2].to_i(10)]
1300
+ elsif m[:away] ## fallback if no away score; check away flag
1301
+ score[:away] = true
1302
+ end
1303
+
1304
+ ## add aet flag true/false
1305
+ # score[:aet] = true if m[:aet] || m[:aetgg] || m[:aetsg]
1306
+
1307
+ ## add golden/silver flags
1308
+ score[:golden] = true if m[:aetgg] ## golden goal (gg)/sudden death (sd)
1309
+ score[:silver] = true if m[:aetsg] ## silver goal (sg)
807
1310
 
808
1311
  ## note - for debugging keep (pass along) "literal" score
809
- [:SCORE_MORE, [m[:score_more], score]]
1312
+ [:SCORE_FULLER, [m[:score_fuller], score]]
1313
+ elsif m[:score_fuller_more]
1314
+ ## SCORE + SCORE_FULLER_MORE
1315
+ ## note - after extra-time (aet) or full-time (ft)
1316
+ ## score may be present in SCORE!!!
1317
+ score = {}
1318
+ score[:p] = [m[:p1].to_i(10),
1319
+ m[:p2].to_i(10)] if m[:p1] && m[:p2]
1320
+ score[:et] = [m[:et1].to_i(10),
1321
+ m[:et2].to_i(10)] if m[:et1] && m[:et2]
1322
+ score[:ft] = [m[:ft1].to_i(10),
1323
+ m[:ft2].to_i(10)] if m[:ft1] && m[:ft2]
1324
+ score[:ht] = [m[:ht1].to_i(10),
1325
+ m[:ht2].to_i(10)] if m[:ht1] && m[:ht2]
1326
+ score[:agg] = [m[:agg1].to_i(10),
1327
+ m[:agg2].to_i(10)] if m[:agg1] && m[:agg2]
1328
+ if m[:away1] && m[:away2]
1329
+ score[:away] = [m[:away1].to_i(10),
1330
+ m[:away2].to_i(10)]
1331
+ elsif m[:away] ## fallback if no away score; check away flag
1332
+ score[:away] = true
1333
+ end
1334
+
1335
+ ## add flag in score for et/ft/ht
1336
+ score[:score] = 'et' if m[:aet] || m[:aetgg] || m[:aetsg]
1337
+ score[:score] = 'ft' if m[:ft]
1338
+ score[:score] = 'ht' if m[:ht]
1339
+
1340
+ ## add golden/silver flags
1341
+ score[:golden] = true if m[:aetgg] ## golden goal (gg)/sudden death (sd)
1342
+ score[:silver] = true if m[:aetsg] ## silver goal (sg)
1343
+
1344
+ ## note - for debugging keep (pass along) "literal" score
1345
+ [:SCORE_FULLER_MORE, [m[:score_fuller_more], score]]
810
1346
  elsif m[:score]
811
1347
  score = {}
812
- ## must always have ft for now e.g. 1-1 or such
813
- ### change to (generic) score from ft -
814
- ## might be score a.e.t. or such - why? why not?
815
- score[:ft] = [m[:ft1].to_i(10),
816
- m[:ft2].to_i(10)]
817
- ## note - for debugging keep (pass along) "literal" score
1348
+ ## note - score is "generic"
1349
+ ## might be full-time (ft) or
1350
+ ## after extra-time (aet) or such
1351
+ ## or even undecided/unknown
1352
+ ## thus, use score1/score2 and NOT ft1/ft2
1353
+ score[:score] = [m[:score1].to_i(10),
1354
+ m[:score2].to_i(10)]
1355
+ ## note - for debugging keep (pass along) "literal" score
818
1356
  [:SCORE, [m[:score], score]]
1357
+ elsif m[:score_awd] ## score awarded (awd/awd.)
1358
+ score = {}
1359
+ ### note - use "generic" score for now
1360
+ ## to match A 3-0 B [awarded] etc.
1361
+ score[:score] = [m[:score1].to_i(10),
1362
+ m[:score2].to_i(10)]
1363
+ ## add score[:awarded] = true ???
1364
+ ## or only use match status to avoid duplicate?
1365
+ [:SCORE_AWD, [m[:score_awd], score]]
1366
+ elsif m[:score_abd] ## score abandoned (abd/abd.)
1367
+ score = {}
1368
+ ### note - use "generic" score for now
1369
+ score[:score] = [m[:score1].to_i(10),
1370
+ m[:score2].to_i(10)]
1371
+ ## add score[:awarded] = true ???
1372
+ ## or only use match status to avoid duplicate?
1373
+ [:SCORE_ABD, [m[:score_abd], score]]
819
1374
  elsif m[:minute]
820
1375
  minute = {}
821
1376
  minute[:m] = m[:value].to_i(10)
@@ -833,6 +1388,7 @@ def _tokenize_line( line )
833
1388
  when '@' ## enter geo mode
834
1389
  puts " ENTER GEO_RE MODE" if debug?
835
1390
  @re = GEO_RE
1391
+ geo_count = 0
836
1392
  [:'@']
837
1393
  when ',' then [:',']
838
1394
  when ';' then [:';']
@@ -840,10 +1396,14 @@ def _tokenize_line( line )
840
1396
  when '|' then [:'|']
841
1397
  when '[' then [:'[']
842
1398
  when ']' then [:']']
843
- when '-' then [:'-'] # level 1 OR (classic) dash
844
- when '--' then [:'--'] # level 2
845
- when '---' then [:'---'] # level 3
846
- when '----' then [:'----'] # level 4
1399
+ when '-' then [:'-']
1400
+ when '(' ## enter goal scorer mode on "free-floating" open parenthesis!!!
1401
+ puts " ENTER GOAL_RE MODE" if debug?
1402
+ @re = GOAL_RE
1403
+ ## note - eat-up ( for now; do NOT pass along as token
1404
+ ## pass along "virtual" INLINE GOALS - why? why not?
1405
+ [:INLINE_GOALS, "<|INLINE_GOALS|>"]
1406
+ when ')' then [:')']
847
1407
  else
848
1408
  puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
849
1409
  nil ## ignore others (e.g. brackets [])
@@ -884,21 +1444,24 @@ def _tokenize_line( line )
884
1444
  end
885
1445
 
886
1446
 
887
- if @re == GOAL_RE ### ALWAYS switch back to top level mode
888
- puts " LEAVE GOAL_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
889
- @re = RE
890
- end
1447
+ # if @re == GOAL_RE ### ALWAYS switch back to top level mode
1448
+ # puts " LEAVE GOAL_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
1449
+ # @re = RE
1450
+ # end
891
1451
 
892
1452
  if @re == GEO_RE ### ALWAYS switch back to top level mode
893
1453
  puts " LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
894
1454
  @re = RE
895
1455
  end
1456
+
1457
+ @re = RE if @re == GROUP_DEF_RE ### ALWAYS switch back to top level mode
1458
+ @re = RE if @re == ROUND_DEF_RE
896
1459
 
897
1460
  ##
898
1461
  ## if in prop mode continue if last token is [,-]
899
1462
  ## otherwise change back to "standard" mode
900
1463
  if @re == PROP_RE || @re == PROP_CARDS_RE ||
901
- @re == PROP_GOAL_RE || @re == PROP_PENALTIES_RE ||
1464
+ @re == PROP_PENALTIES_RE ||
902
1465
  @re == PROP_ATTENDANCE_RE || @re == PROP_REFEREE_RE
903
1466
  if [:',', :'-', :';'].include?( tokens[-1][0] )
904
1467
  ## continue/stay in PROP_RE mode