sportdb-parser 0.7.1 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +1 -1
  3. data/Manifest.txt +17 -4
  4. data/lib/sportdb/parser/lexer-on_goal.rb +172 -0
  5. data/lib/sportdb/parser/lexer-on_group_def.rb +31 -0
  6. data/lib/sportdb/parser/lexer-on_prop_lineup.rb +79 -0
  7. data/lib/sportdb/parser/lexer-on_prop_misc.rb +110 -0
  8. data/lib/sportdb/parser/lexer-on_prop_penalties.rb +40 -0
  9. data/lib/sportdb/parser/lexer-on_round_def.rb +37 -0
  10. data/lib/sportdb/parser/lexer-on_top.rb +125 -0
  11. data/lib/sportdb/parser/lexer-prep_doc.rb +131 -0
  12. data/lib/sportdb/parser/lexer-prep_line.rb +63 -0
  13. data/lib/sportdb/parser/lexer-tokenize.rb +449 -0
  14. data/lib/sportdb/parser/lexer.rb +133 -1363
  15. data/lib/sportdb/parser/lexer_buffer.rb +8 -37
  16. data/lib/sportdb/parser/lexer_token.rb +126 -0
  17. data/lib/sportdb/parser/parser.rb +1104 -1403
  18. data/lib/sportdb/parser/racc_parser.rb +36 -32
  19. data/lib/sportdb/parser/racc_tree.rb +65 -98
  20. data/lib/sportdb/parser/token-date--helpers.rb +130 -0
  21. data/lib/sportdb/parser/token-date--names.rb +108 -0
  22. data/lib/sportdb/parser/token-date.rb +20 -192
  23. data/lib/sportdb/parser/token-date_duration.rb +8 -27
  24. data/lib/sportdb/parser/token-geo.rb +16 -16
  25. data/lib/sportdb/parser/token-goals--helpers.rb +114 -0
  26. data/lib/sportdb/parser/token-goals.rb +103 -249
  27. data/lib/sportdb/parser/token-group.rb +8 -22
  28. data/lib/sportdb/parser/token-prop.rb +138 -124
  29. data/lib/sportdb/parser/token-prop_name.rb +48 -39
  30. data/lib/sportdb/parser/token-round.rb +21 -35
  31. data/lib/sportdb/parser/token-score--helpers.rb +189 -0
  32. data/lib/sportdb/parser/token-score.rb +9 -393
  33. data/lib/sportdb/parser/token-score_full.rb +331 -0
  34. data/lib/sportdb/parser/token-status.rb +44 -46
  35. data/lib/sportdb/parser/token-status_inline.rb +112 -0
  36. data/lib/sportdb/parser/token-text.rb +41 -31
  37. data/lib/sportdb/parser/token-time.rb +29 -26
  38. data/lib/sportdb/parser/token.rb +58 -159
  39. data/lib/sportdb/parser/version.rb +1 -1
  40. data/lib/sportdb/parser.rb +45 -17
  41. metadata +19 -6
  42. data/lib/sportdb/parser/blocktxt.rb +0 -99
  43. data/lib/sportdb/parser/lexer_tty.rb +0 -111
  44. data/lib/sportdb/parser/token-table.rb +0 -149
  45. data/lib/sportdb/parser/token_helpers.rb +0 -92
@@ -7,6 +7,9 @@ class Lexer
7
7
  def log( msg )
8
8
  ## append msg to ./logs.txt
9
9
  ## use ./errors.txt - why? why not?
10
+ ##
11
+ ## change to ./logs_lexer.txt or such - why? why not?
12
+ ## auto-add/prepend [Lexer] and timestamp!!! to msg - why? why not?
10
13
  File.open( './logs.txt', 'a:utf-8' ) do |f|
11
14
  f.write( msg )
12
15
  f.write( "\n" )
@@ -14,387 +17,125 @@ def log( msg )
14
17
  end
15
18
 
16
19
 
17
- ###
18
- ## todo/fix - use LangHelper or such
19
- ## e.g. class Lexer
20
- ## include LangHelper
21
- ## end
22
- ##
23
- ## merge back Lang into Lexer - why? why not?
24
- ## keep "old" access to checking for group, round & friends
25
- ## for now for compatibility
26
- def is_group?( text ) Lang.is_group?( text ); end
27
- def is_round?( text ) Lang.is_round?( text ); end
28
-
20
+ def _trace( *args )
21
+ if debug?
22
+ print "[DEBUG] Lexer -- "
23
+ args.each { |arg| puts args }
24
+ end
25
+ end
26
+
27
+ def _warn( *args )
28
+ print "!! [WARN] Lexer -- "
29
+ args.each { |arg| puts args }
30
+ end
29
31
 
32
+ def _info( *args )
33
+ print "[INFO] Lexer -- "
34
+ args.each { |arg| puts args }
35
+ end
30
36
 
31
37
 
32
38
  def debug?() @debug == true; end
33
39
 
34
- def initialize( lines, debug: false )
35
- raise ArgumentError, "(string) text expected for lexer; got #{lines.class.name}" unless lines.is_a?(String)
36
-
37
- @debug = debug
38
- @txt = lines
39
- end
40
40
 
41
41
 
42
- HTML_COMMENT_RE = %r{ <!--
43
- .*? ## note - use non-greedy/lazy *? match
44
- -->
45
- }xm ## note - turn on multi-line match (for dot (.))
46
42
 
47
43
 
48
- ##
49
- ## note - [] block may NOT incl. square brackets
50
- ## what about comments (e.g. #)?
51
- ## todo/check - rename to NOTE_BLOCK or TEXT_BLOCK or ???
52
- PREPROC_BLOCK_RE = %r{ \[
53
- [^\[\]\#]*? ## note - use non-greedy/lazy *? match
54
- \]
55
- }xm ## note - turn on multi-line match (for dot(.))
44
+ def initialize( txt, debug: false ) ## txt - text to tokenize (must be a String); debug - flag read by debug?
45
+ raise ArgumentError, "text as string expected for lexer; got #{txt.class.name}" unless txt.is_a?(String) ## fail fast on non-string input
56
46
 
47
+ @txt = txt
48
+ @debug = debug
49
+ end
57
50
 
58
- ##
59
- ## check for "literal" (multi-line) note blocks
60
- ## eg. nb: or note:
61
- ## space required after double colon - why? why not?
62
- PREPROC_NOTA_BENE_RE = %r{
63
- ^
64
- [ ]* (?: nb | note) [ ]* : [ ]+
65
- .+? ## non-greedy
66
-
67
- ## positive lookahead
68
- ## note - must end with blank line or end-of-file/document
69
- ## note - do NOT eat-up trailing hrule (---)
70
- (?= (?: \n [ ]* -{3,} [ ]*)?
71
- \n[ ]*\n
72
- | \z
73
- )
74
- }xim
75
51
 
76
- ##
77
- ## replace "escaped" newline with non-newline char e.g. '↵'
78
- LINE_CONTINUATION_RE = %r{
79
- \\[ ]* \n
80
- }x
81
52
 
82
53
 
54
+ def tokenize_with_errors
83
55
 
84
- ###
85
- ## check for magic comments
86
- ## e.g # teletype: true or TELETYPE: TRUE
87
- ## tty/teletype
56
+ tokens_by_line = [] ## note: add tokens line-by-line (flatten later)
57
+ errors = [] ## keep a list of errors - why? why not?
88
58
 
89
- MAGIC_COMMENT_RE = %r{ \A
90
- [ ]* ## optional leading spaces
91
- \#+ ## note - allow ##,###, etc. too
92
- [ ]* ## optional spaces
93
- (?<magic_comment_key> tty | teletype )
94
- [ ]* ## optional spaces
95
- :
96
- [ ]* ## optional spaces
97
- (?<magic_comment_value> true | false )
98
- [ ]* ## optional trailing spaces
99
- \z
100
- }ix
101
59
 
60
+ txt = _prep_doc( @txt )
102
61
 
103
62
 
104
63
 
64
+ ####
65
+ ## quick hack - keep re state/mode between tokenize calls!!!
66
+ @re ||= RE ## note - switch between RE & INSIDE_RE
105
67
 
68
+ lineno = 0
69
+ txt.each_line do |line|
70
+ lineno += 1
106
71
 
107
- def tokenize_with_errors
72
+ ## todo - "inlined virtual/collapsed/folded newlines"
73
+ ## check for "↵" !!!
74
+ ## and add to lineno
108
75
 
109
- ####
110
- ## flags / modes
111
- @teletype = false # use magic comment - tty/teletype: true
112
76
 
77
+ ## note - KEEP leading spaces for indent
78
+ ## use rstrip (NOT left/leading & right/trailing strip) only!!
79
+ ## note - remove/strip trailing newline (and optional spaces)!!!
80
+ ## trailing whitespace may incl. \n or \r\n!!!
81
+ line = line.rstrip
113
82
 
114
83
 
115
- tokens_by_line = [] ## note: add tokens line-by-line (flatten later)
116
- errors = [] ## keep a list of errors - why? why not?
117
-
118
- ## preprocess automagically - why? why not?
119
- ## strip lines with comments and empty lines striped / removed
120
- ## keep empty lines? why? why not?
121
- ## keep leading spaces (indent) - why?
122
- ##
123
- ## note - KEEP empty lines (get turned into BLANK token!!!!)
124
-
125
-
126
- ## "universal" newlines
127
- ## replace all windows-style cr+lf (\r\n) to lf (\n) only
128
- txt = @txt.gsub( "\r\n", "\n" )
129
-
130
-
131
-
132
- ###
133
- ## quick hack for now
134
- ## remove html-style comments <!-- -->
135
- ## (incl. multi-line) with two spaces
136
- ## will mess-up lineno tracking!!!
137
- ## fix later to have function lineno & colno!!!
138
- txt = @txt.gsub( HTML_COMMENT_RE ) do |m|
139
- puts " [debug] preproc html comment:"
140
- puts m
141
- ' '
142
- end
143
-
144
-
145
- =begin
146
- ##
147
- ## todo/fix - add a command line switch/option for auto-format fixes !!!
148
- ## quick hack - remove later
149
- ## auto-convert "old" legacy round markers (»)
150
- txt = txt.gsub( %r{^ [ ]*
151
- »
152
- (?= [ ]+) ## require one trailing space for now!!
153
- }ix ) do |_|
154
- puts "!! WARN - auto-fix format; replacing old (alternate/legacy) round marker (»)"
155
- '▪'
156
- end
157
-
158
-
159
- ### 16.00 => 16:00
160
- ## todo/check - use space for positive lookbehind & ahead
161
- ## (instead of \b) - why? why not?
162
- ## note - check for/exclude 12.12. date in match
163
- ## use negative lookahead
164
- ## check for 12.12.94
165
- ## use positive lookbehind !!!
166
- ## must be space, comma or begin-of-line [ ,]|^
167
- ## or use negative lookbehind
168
- ## must NOT be dot
169
- txt = txt.gsub( %r{
170
- ## check NEGATIVE lookbehind
171
- (?<! [.]) ## do NOT match 12.94 in 12.12.94
172
- \b
173
- (?<h>\d{1,2})
174
- \.
175
- (?<m>\d{2})
176
- \b
177
- (?! [.] ) ## do NOT match 12.12.
178
- }ix ) do |_|
179
- m = $~ ## is $LAST_MATCH_DATA
180
- puts "!! WARN - auto-fix format; replacing old (alternate/legacy) time format #{m[0]}"
181
- "#{m[:h]}:#{m[:m]}" ## '\1:\2'
182
- end
183
- =end
184
-
185
-
186
-
187
-
188
- ###
189
- ## add more "native" multi-line comment-styles
190
- ## e.g. #[[ ... ]] or #<<< .. >>> or #<< .. >>
191
- ## or such - why? why not?
192
-
193
-
194
- txt = txt.gsub( PREPROC_NOTA_BENE_RE ) do |m|
195
- if m.include?( "\n" ) ## check for newlines (\n) and replace
196
- puts " [debug] preproc (multi-line) note/nota bene block:"
197
- puts m
198
- ## todo/check: replace with two spaces insead of ↵ - why? why not?
199
- m.gsub( "\n", '↵' )
200
- else
201
- m
202
- end
203
- end
84
+ ### skip comments
85
+ ## todo/check - change to blank line
86
+ ## to keep lineno (closer to original) - why? why not?
87
+ next if line.match?(/\A [ ]* ## optional leading space(s)
88
+ \#
89
+ /x )
204
90
 
91
+ ## strip (inline) end-of-line comments (from line)
92
+ ## check/discuss: make - inline comment require trailing space
93
+ ## e.g. #1 vs # 1 - why? why not?
94
+ line = line.sub( / [ ]* ## (eat-up) optional leading space(s)
95
+ \#{1,}.*?
96
+ \z
97
+ /x, '' )
205
98
 
206
- ##
207
- ## e.g. used in (multi-line) TableNote
208
- ## 1.SOUTH KOREA 6 5 1 0 22- 1 16 [0-0]
209
- ## 2.LEBANON 6 3 1 2 11- 8 10 [0-2, 0-0]
210
- ## 3.Turkmenistan 6 3 0 3 8-11 9 [3-1]
211
- ## 4.Sri Lanka 6 0 0 6 2-23 0 [0-1]
212
- ## -.North Korea [withdrew after playing 5 matches due to safety concerns in
213
- ## connection with the Covid-19 pandemic; all results annulled]
214
- ##
215
- ## note - no longer used for now
216
- ## enclose multi-line notes in []
217
- ## removes need for line continuation for now
218
-
219
- ##
220
- ## txt = txt.gsub( LINE_CONTINUATION_RE ) do |_|
221
- ## puts " [debug] preproc line continuation"
222
- ## ## todo/check: replace with two spaces insead of ↵ - why? why not?
223
- ## '↵'
224
- ## end
225
-
226
-
227
-
228
- #####
229
- ## (another) quick hack for now
230
- ## turn multi-line note blocks into
231
- ## single-line note blocks
232
- ## by changing newline (\n) to ⏎ (unicode U+23CE)
233
- ## or why not to ___ ?
234
- ##
235
- ## unicode options for return/arrows:
236
- ## - ↵ (U+21B5): Downwards Arrow With Corner Leftwards.
237
- ## This is the most common "carriage return" symbol.
238
- ## - ⏎ (U+23CE): Return Symbol.
239
- ## Specifically designated as the keyboard's "Return" key symbol,
240
- ## often used in user interfaces.
241
-
242
- txt = txt.gsub( PREPROC_BLOCK_RE ) do |m|
243
- if m.include?( "\n" ) ## check for newlines (\n) and replace
244
- puts " [debug] preproc (multi-line) block:"
245
- puts m
246
- ## todo/check: replace with two spaces insead of ↵ - why? why not?
247
- m.gsub( "\n", '↵' )
248
- else
249
- m
250
- end
251
- end
252
99
 
100
+ ####
101
+ # support __END__ marker to cut-off input
102
+ break if line.match?( /\A [ ]* ## optional leading space(s)
103
+ __END__
104
+ \z
105
+ /x )
253
106
 
254
- ####
255
- ## quick hack - keep re state/mode between tokenize calls!!!
256
- @re ||= RE ## note - switch between RE & INSIDE_RE
257
-
258
107
 
259
- txt.each_line do |line|
260
- ## line = line.rstrip ## note - MUST remove/strip trailing newline (spaces optional)!!!
261
- line = line.strip ## note - strip leading AND trailing whitespaces
262
- ## note - trailing whitespace may incl. \n or \r\n!!!
263
-
264
-
265
- ##
266
- ###
267
- ## check for magic comments
268
- ## e.g # teletype: true or TELETYPE: TRUE
269
- ## tty/teletype
270
-
271
- if line.start_with?('#') ### skip comments (& check magic comments!!)
272
-
273
- if (m = MAGIC_COMMENT_RE.match(line))
274
- magic_comment_key = m[:magic_comment_key].downcase
275
- magic_comment_value = m[:magic_comment_value].downcase
276
-
277
- ## turn on teletype mode
278
- ## e.g. tty: true or teletype: true
279
- if ['tty', 'teletype'].include?( magic_comment_key ) &&
280
- ['true'].include?( magic_comment_value )
281
- puts " magic comment - turn on teletype (tty) mode"
282
- @teletype = true
283
- end
284
- end
285
-
286
- next
287
- end
288
108
 
289
- line = line.sub( /#.*/, '' ).strip ### cut-off end-of line comments too
109
+ ## auto-fixes line-by-line (e.g. check for tabs, smart quotes, etc.)
110
+ line = _prep_line( line )
290
111
 
291
112
 
292
- ####
293
- # support __END__ marker to cut-off input
294
- break if line.strip == '__END__'
295
-
296
-
297
-
298
- ##
299
- ## first check for tabs
300
- ## add error/warn
301
- ## for auto-fix - replace tabs with two spaces
302
-
303
- line = line.gsub( "\t" ) do |_|
304
- ## report error here
305
- ## todo/add error here
306
- puts "!! WARN - auto-fix; replacing tab (\\t) with two spaces in line #{line.inspect}"
307
- " " ## replace with two spaces
308
- end
309
-
310
-
311
- ## U+00A0 (160) -- non-breaking space (unicode)
312
- line = line.gsub( "\u00A0" ) do |uni|
313
- ## report error here
314
- ## todo/add error here
315
- puts "!! WARN - auto-fix; replacing non-breaking unicode space (#{uni}/#{uni.ord}) w/ ascii space ( /#{" ".ord}) in line #{line.inspect}"
316
- " " ## replace with space
317
- end
318
-
319
- ###
320
- ## todo/fix - print unicode numbers for [–−]
321
- ## different candidates to differentiate and document!!!
322
- ## – => U+2013 (8211) -- En Dash (unicode)
323
- ## − => U+2212 (8722) -- Minus Sign (unicode)
324
- line = line.gsub( /[–−]/ ) do |uni|
325
- ## report error here
326
- ## todo/add error here
327
- puts "!! WARN - auto-fix; replacing unicode dash (#{uni}/#{uni.ord}) w/ ascii dash (-/#{"-".ord}) in line #{line.inspect}"
328
- '-' ## replace with ascii dash (-)
329
- end
330
-
331
-
332
-
333
- puts "line: >#{line}<" if debug?
113
+ _trace( "line #{lineno}: >#{line}<" )
114
+
334
115
 
335
116
  ######
336
117
  ### special case for empty line (aka BLANK)
337
118
  if line.empty?
338
119
  ## note - blank always resets parser mode to std/top-level!!!
339
120
  @re = RE
340
- tokens_by_line << [[:BLANK, '<|BLANK|>']]
121
+ tokens_by_line << [Token.virtual(:BLANK, lineno: lineno)]
341
122
  elsif (m = HEADING_RE.match(line))
342
123
  ## note - heading always resets parser mode to std/top-level!!!
343
124
  @re = RE
344
- puts " HEADING" if debug?
125
+ _trace( 'HEADING' )
345
126
  ## note - derive heading level from no of (leading) markers
346
127
  ## e.g. = is 1, == is 2, === is 3, etc.
347
- heading_level = m[:heading_marker].size
348
- tokens_by_line << [[:"H#{heading_level}", m[:heading]]]
128
+ heading_level = m[:heading_marker].size
129
+ tokens_by_line << [Token.new(:"H#{heading_level}", m[:heading], lineno: lineno)]
349
130
  elsif (m = NOTA_BENE_RE.match(line))
350
131
  ## note - nota bene always resets parser mode to std/top-level!!!
351
132
  @re = RE
352
- tokens_by_line << [[:NOTA_BENE, m[:nota_bene]]]
353
- elsif @re == RE && (m = TABLE_RE.match(line))
354
- @re = TABLE_MORE_RE ## switch into table mode
355
- if m[:table_heading]
356
- tokens_by_line << [[:TABLE_HEADING, m[:table_heading]]]
357
- else ## assume table (line) e.g. m[:table]
358
- tokens_by_line << [[:TABLE_LINE, line]]
359
- end
360
- elsif @re == TABLE_MORE_RE
361
- ### todo/fix - check if no match and report/add error!!
362
- ## for now (ummatched) line gets auto-added as table line!!!
363
- ##
364
- ## note - MUST be followed by blank line (or nota bene/heading)
365
- ## to switch back into to top-level!!!!
366
- m = TABLE_MORE_RE.match(line)
367
- if m[:table_note]
368
- tokens_by_line << [[:TABLE_NOTE, m[:table_note]]]
369
- elsif m[:table_divider]
370
- tokens_by_line << [[:TABLE_DIVIDER, m[:table_divider]]]
371
- else ## assume table (line) e.g. m[:table]
372
- tokens_by_line << [[:TABLE_LINE, line]]
373
- end
374
- elsif @re != TABLE_MORE_RE && (m = HRULER_RE.match(line))
375
- ## note - hruler (---)
376
- ## will only match if NOT in table mode!!!
377
- ## otherwise
378
- ## hruler always resets parser mode to std/top-level!!!
379
- @re = RE
380
- tokens_by_line << [[:HRULER, '<|HRULER|>']]
381
- elsif @teletype && (@re == RE && IS_TTY_LINE_RE.match(line))
382
- ## try experimental TELETYPE (TTY) mode!!!
383
- ## note - turn on via magic comment e.g. tty/teletype: true
384
- ###
385
- ### move inside _tokenize_line - why? why not?
386
-
387
-
388
- tokens_by_line << _tokenize_tty_line( line )
389
-
390
- ## note - dates such as
391
- ## APR 11 or 11 APR will trigger TELETYPE
392
- ### ## check letter
133
+ tokens_by_line << [Token.new(:NOTA_BENE, m[:nota_bene], lineno: lineno)]
393
134
  else
394
135
 
395
- more_tokens, more_errors = _tokenize_line( line )
396
-
397
- tokens_by_line << more_tokens
136
+ more_tokens, more_errors = _tokenize_line( line, lineno )
137
+
138
+ tokens_by_line << more_tokens
398
139
  errors += more_errors
399
140
  end
400
141
  end # each line
@@ -402,1084 +143,113 @@ def tokenize_with_errors
402
143
 
403
144
 
404
145
 
146
+ tokens_by_line = tokens_by_line.map do |tokens|
405
147
 
406
- tokens_by_line = tokens_by_line.map do |tokens|
407
148
  #################
408
- ## transform tokens (using simple patterns)
409
- ## to help along the (racc look ahead 1 - LA1) parser
149
+ ## transform tokens (using simple patterns)
150
+ ## to help along the (racc look ahead 1 - LA1) parser
410
151
  nodes = []
411
152
 
412
153
  buf = Tokens.new( tokens )
413
154
  ## pp buf
414
155
 
156
+
415
157
  loop do
416
158
  break if buf.eos?
417
159
 
418
160
  if buf.match?( :DATE, :TIME ) ## merge DATE TIME into DATETIME
419
- date = buf.next[1]
420
- time = buf.next[1]
161
+ date = buf.next
162
+ time = buf.next
421
163
  ## puts "DATETIME:"
422
164
  ## pp date, time
165
+
423
166
  ## note: time value is { time: {} } or
424
167
  ## { time: {}, time_local {} }
425
- val = [date[0] + ' ' + time[0], ## concat string of two tokens
426
- { date: date[1] }.merge( time[1] )
427
- ]
428
- nodes << [:DATETIME, val]
429
- ### support date time with comma too - why? why not?
430
- elsif buf.match?( :DATE, :',', :TIME )
431
- date = buf.next[1]
432
- _ = buf.next ## ignore comma
433
- time = buf.next[1]
168
+ text = date.text + ' ' + time.text, ## concat string of two tokens
169
+ value = { date: date.value }.merge( time.value )
170
+
171
+ nodes << Token.new(:DATETIME, text,
172
+ lineno: date.lineno,
173
+ offset: [date.offset[0],
174
+ time.offset[1]],
175
+ value: value )
176
+ ### support date time with comma too - why? why not?
177
+ elsif buf.match?( :DATE, ',', :TIME )
178
+ date = buf.next
179
+ _ = buf.next ## ignore comma
180
+ time = buf.next
434
181
  ## puts "DATETIME:"
435
182
  ## pp date, time
436
- val = [date[0] + ', ' + time[0], ## concat string of two tokens
437
- { date: date[1] }.merge( time[1] )
438
- ]
439
- nodes << [:DATETIME, val]
440
- elsif buf.match?( :TEAM, :SCORE_TEAM )
441
- ## merge TEAM SCORE_TEAM into TEAMALT
442
- ## (use TEAMENTRY or TEAMRESULT - why? why not?)
443
- team = buf.next[1]
444
- score_team = buf.next[1]
445
- val = [team + ' ' + score_team[0], ## concat string of two tokens
446
- { team: team }.merge( score_team[1] )
447
- ]
448
- nodes << [:TEAMALT, val]
449
- elsif buf.match?( :TEAM, :SCORE_TEAM_PEN )
450
- team = buf.next[1]
451
- score_team_pen = buf.next[1]
452
- val = [team + ' ' + score_team_pen[0], ## concat string of two tokens
453
- { team: team }.merge( score_team_pen[1] )
454
- ]
455
- nodes << [:TEAMALT_PEN, val]
456
- elsif buf.match?( :TEAM, :SCORE_TEAM_NUM )
457
- team = buf.next[1]
458
- score_team_num = buf.next[1]
459
- val = [team + ' ' + score_team_num[0], ## concat string of two tokens
460
- { team: team }.merge( score_team_num[1] )
461
- ]
462
- nodes << [:TEAMALT_NUM, val]
463
- elsif buf.match?( :GOAL_MINUTE, :',', :GOAL_MINUTE )
183
+ text = date.text + ', ' + time.text ## concat string of two tokens
184
+ value = { date: date.value }.merge( time.value )
185
+
186
+ nodes << Token.new(:DATETIME, text,
187
+ lineno: date.lineno,
188
+ offset: [date.offset[0],
189
+ time.offset[1]],
190
+ value: value )
191
+ elsif buf.match?( :GOAL_MINUTE, ',', :GOAL_MINUTE )
464
192
  ## note - only advance by two tokens!
465
193
  ## allows more :GOAL_MINUTE sequences!! e.g. 12,13,14 etc!!!
466
- ##
194
+ ##
467
195
  ## help parser with comma shift/reduce conflict
468
196
  ## change ',' to GOAL_MINUTE_SEP !!!
469
- nodes << buf.next ## pass through goal_minute
470
- _ = buf.next ## eat-up goal_minute_sep a.k.a. comma (,)
197
+ nodes << buf.next ## pass through goal_minute
198
+ comma = buf.next ## eat-up goal_minute_sep a.k.a. comma (,)
471
199
  ## and replace with dedicated sep(arator)
472
- nodes << [:GOAL_MINUTE_SEP,"<|GOAL_MINUTE_SEP|>"]
473
- elsif buf.match?( :',', :INLINE_ATTENDANCE )
474
- ## note - allow optional comma before inline attendance
200
+ nodes << Token.new( :GOAL_MINUTE_SEP,
201
+ comma.text,
202
+ lineno: comma.lineno,
203
+ offset: comma.offset,
204
+ value: comma.value)
205
+ elsif buf.match?( ',', :INLINE_ATTENDANCE )
206
+ ## note - allow optional comma before inline attendance
475
207
  ## help parser with comma shift/reduce conflict
476
208
  ## change ',' to INLINE_ATTENDANCE_SEP !!!
477
- nodes << [:INLINE_ATTENDANCE_SEP, "<|INLINE_ATTENDANCE_SEP|>"]
478
- _ = buf.next ## eat-up inline_attendance_sep a.k.a. comma (,)
209
+ comma = buf.next ## eat-up inline_attendance_sep a.k.a. comma (,)
479
210
  ## and replace with dedicated sep(arator)
480
- nodes << buf.next ## pass through inline_attendance
211
+ nodes << Token.new(:INLINE_ATTENDANCE_SEP,
212
+ comma.text,
213
+ lineno: comma.lineno,
214
+ offset: comma.offset,
215
+ value: comma.value)
216
+ nodes << buf.next ## pass through inline_attendance
481
217
  else
482
218
  ## pass through
483
219
  nodes << buf.next
484
220
  end
485
221
  end # loop
486
- nodes
222
+ nodes
487
223
  end # map tokens_by_line
488
224
 
489
225
 
490
-
226
+ ## puts "tokens_by_line:"
227
+ ## pp tokens_by_line
228
+
491
229
 
492
230
  ## flatten tokens
493
231
  tokens = []
494
- tokens_by_line.each do |tok|
232
+ tokens_by_line.each do |tok_line|
495
233
 
496
- if debug?
497
- pp tok
498
- end
234
+ ## if debug?
235
+ ## pp tok_line
236
+ ## end
499
237
 
238
+ tokens += tok_line
500
239
 
501
- ###############
502
- ## "hacky" (automagic) line merges (remove newline)
503
- ## if line start with @ - check if incl. teams
504
-
505
- ###
506
- ### quick merge lines hack
507
- ## if line starts with geo-marker token @
508
- ## check if line incl. TEAM
509
- ## if yes, leave alone
510
- ## otherwise merge line into previous line!!
511
- ## - todo/fix - handle in possibly in grammar!!!
512
- ## for now match_line CAN start with @ London
513
- ## resulting in parser conflict(s)!!!
514
- ## e.g.
515
- ## England v Scotland
516
- ## @ London
517
- ## =>
518
- ## England v Scotland @ London
519
- ##
520
-
521
- ##
522
- ## note/todo - if INDENT / SPACES get added
523
- ## adjust here
524
- ## tok[0][0] == :INDENT (or :SPACES) &&
525
- ## tok[1][0] == :'@'
526
-
527
- if tok[0] && tok[0][0] == :'@'
528
- team = tok.find { |t| t[0] == :TEAM }
529
- if team
530
- ## do nothing - keep as is (assume match_line starting w/ @)
531
- else
532
- ## no team(s) found in line
533
- ## remove last token (that is, NEWLINE)
534
- ## note - possibly is blank ?! keep blank
535
- tokens.pop if tokens[-1][0] == :NEWLINE
536
- end
537
- end
538
-
539
-
540
- tokens += tok
541
240
  ## auto-add newlines (unless BLANK!!)
542
- tokens << [:NEWLINE, "\n"] unless tok[0] && tok[0][0] == :BLANK
241
+ unless tok_line[0] && tok_line[0].type == :BLANK ## NOTE(review): an empty tok_line (tok_line[0] is nil) also passes this guard
242
+ ## note - reuse lineno from first token in line
243
+ ## use last - why? why not?
244
+ tokens << Token.newline( lineno: tok_line[0].lineno ) ## NOTE(review): raises NoMethodError if tok_line is empty - confirm _tokenize_line never returns []
245
+ end
543
246
  end
544
247
 
545
248
  [tokens,errors]
546
- end # method tokenize_with_errors
547
-
548
-
549
-
550
-
551
- def _tokenize_line( line )
552
- tokens = []
553
- errors = [] ## keep a list of errors - why? why not?
554
249
 
250
+ end # method tokenize_with_errors
555
251
 
556
- pos = 0
557
- ## track last offsets - to report error on no match
558
- ## or no match in end of string
559
- offsets = [0,0]
560
- m = nil
561
-
562
- ## track number of geo text seen
563
- ## (use for - do NOT break on two spaces if no geo text seen yet!!)
564
- geo_count = 0
565
-
566
- ####
567
- ## quick hack - keep re state/mode between tokenize calls!!!
568
- @re ||= RE ## note - switch between RE & INSIDE_RE
569
-
570
-
571
- if @re == RE ## top-level
572
- ### check for modes once (per line) here to speed-up parsing
573
- ### for now goals only possible for start of line!!
574
- ### fix - remove optional [] - why? why not?
575
-
576
- ####
577
- ## note - ord e.g. (45) for match number can only start a (match) line
578
- ## "inline" use NOT possible
579
- ## note - ord (for ordinal number!!!) e.g match number (1), (42), etc.
580
- if (m = START_WITH_ORD.match(line))
581
- ## note - strip enclosing () and convert to integer
582
- tokens << [:ORD, [m[:ord], { value: m[:value].to_i(10) } ]]
583
-
584
- offsets = [m.begin(0), m.end(0)]
585
- pos = offsets[1] ## update pos
586
- elsif (m = START_WITH_YEAR.match(line))
587
- ## note - strip enclosing () and convert to integer
588
- tokens << [:YEAR, m[:year].to_i(10)]
589
-
590
- offsets = [m.begin(0), m.end(0)]
591
- pos = offsets[1] ## update pos
592
-
593
- ###
594
- ## todo/fix - rename to START_GROUP_DEF_LINE_RE !!!!
595
- elsif (m = GROUP_DEF_LINE_RE.match( line ))
596
- puts " ENTER GROUP_DEF_RE MODE" if debug?
597
- @re = GROUP_DEF_RE
598
-
599
- tokens << [:GROUP_DEF, m[:group_def]]
600
-
601
- offsets = [m.begin(0), m.end(0)]
602
- pos = offsets[1] ## update pos
603
-
604
- ### todo/fix - rename to PROP_KEY_RE to START_WITH_PROP_KEY_RE !!!
605
- elsif (m = PROP_KEY_RE.match( line ))
606
- ## start with prop key (match will switch into prop mode!!!)
607
- ## - fix - remove leading spaces in regex (upstream) - why? why not?
608
- ##
609
- ### switch into new mode
610
- ## switch context to PROP_RE
611
- puts " ENTER PROP_RE MODE" if debug?
612
- key = m[:key]
613
-
614
-
615
- ### todo/fix - add prop yellow/red cards too - why? why not?
616
- ## todo/fix - separate sent off and red card
617
- ## sent-off - incl. red card, yellow/red card and the era before red cards!!
618
- if ['sent off'].include?( key.downcase)
619
- @re = PROP_CARDS_RE ## use CARDS_RE ???
620
- tokens << [:PROP_SENTOFF, m[:key]]
621
- elsif ['red cards'].include?( key.downcase )
622
- @re = PROP_CARDS_RE ## use CARDS_RE ???
623
- tokens << [:PROP_REDCARDS, m[:key]]
624
- elsif ['yellow cards'].include?( key.downcase )
625
- @re = PROP_CARDS_RE
626
- tokens << [:PROP_YELLOWCARDS, m[:key]]
627
- elsif ['ref', 'referee',
628
- 'refs', 'referees' ## note - allow/support assistant refs
629
- ].include?( key.downcase )
630
- @re = PROP_REFEREE_RE
631
- tokens << [:PROP_REFEREE, m[:key]]
632
- elsif ['att', 'attn', 'attendance'].include?( key.downcase )
633
- @re = PROP_ATTENDANCE_RE
634
- tokens << [:PROP_ATTENDANCE, m[:key]]
635
-
636
- # elsif ['goals'].include?( key.downcase )
637
- # @re = PROP_GOAL_RE
638
- # tokens << [:PROP_GOALS, m[:key]]
639
-
640
- elsif ['penalties',
641
- 'penalty shootout',
642
- 'penalty shoot-out',
643
- 'penalty kicks'].include?( key.downcase )
644
- @re = PROP_PENALTIES_RE
645
- tokens << [:PROP_PENALTIES, m[:key]]
646
- else ## assume (team) line-up
647
- @re = PROP_RE ## use LINEUP_RE ???
648
- tokens << [:PROP, m[:key]]
649
- end
650
-
651
- offsets = [m.begin(0), m.end(0)]
652
- pos = offsets[1] ## update pos
653
- ###
654
- ### todo/fix
655
- ### rename to START_WITH_ROUND_DEF_OUTLINE_RE !!!!
656
- elsif (m = ROUND_DEF_OUTLINE_RE.match( line ))
657
- puts " ENTER ROUND_DEF_RE MODE" if debug?
658
- @re = ROUND_DEF_RE
659
-
660
- ## note - return ROUND_DEF NOT ROUND_OUTLINE token
661
- tokens << [:ROUND_DEF, m[:round_outline]]
662
-
663
- offsets = [m.begin(0), m.end(0)]
664
- pos = offsets[1] ## update pos
665
- elsif (m = ROUND_OUTLINE_RE.match( line ))
666
- puts " ROUND_OUTLINE" if debug?
667
- ## note - derive round level from no of (leading) markers
668
- ## e.g. ▪/:: is 1, ▪▪/::: is 2, ▪▪▪/:::: is 3, etc.
669
- ## note - ascii-style starts with double ::, thus, autodecrement by one!
670
- round_level = m[:round_marker].size
671
- round_level -= 1 if m[:round_marker].start_with?( '::' )
672
-
673
- tokens << [:ROUND_OUTLINE, [m[:round_outline],
674
- { outline: m[:round_outline] ,
675
- level: round_level}]]
676
-
677
- ## note - eats-up line for now (change later to only eat-up marker e.g. »|>>)
678
- offsets = [m.begin(0), m.end(0)]
679
- pos = offsets[1] ## update pos
680
- elsif (m = START_GOAL_LINE_RE.match( line )) ## line starting with ( - assume
681
- ## switch context to GOAL_RE (goalline(s))
682
- ####
683
- ## note - check for alternate goal line styles / formats
684
- if START_GOAL_LINE_COMPAT_RE.match(line )
685
- ## "legacy" style starting with minute e.g.
686
- ## (6 Puskás 0-1, 9 Czibor 0-2, 11 Morlock 1-2, 18 Rahn 2-2,
687
- ## 84 Rahn 3-2)
688
- @re = GOAL_COMPAT_RE
689
- puts " ENTER GOAL_COMPAT_RE MODE" if debug?
690
-
691
- tokens << [:GOALS_COMPAT, "<|GOALS_COMPAT|>"]
692
- elsif START_GOAL_LINE_ALT_RE.match( line )
693
- ## goals with scores e.g.
694
- ## (1-0 Franck Ribéry, 2-0 Ivica Olić, 2-1 Wayne Rooney)
695
- ## -or-
696
- ## (Dion Beljo 1-0
697
- ## 1-1 Andreas Gruber
698
- ## Matthias Seidl 2-1)
699
- @re = GOAL_ALT_RE
700
- puts " ENTER GOAL_ALT_RE MODE" if debug?
701
-
702
- tokens << [:GOALS_ALT, "<|GOALS_ALT|>"]
703
- else
704
- ## "standard" / default style
705
- @re = GOAL_RE
706
- puts " ENTER GOAL_RE MODE" if debug?
707
-
708
- tokens << [:GOALS, "<|GOALS|>"]
709
- end
710
-
711
- ## note - eat-up ( for now
712
- ## pass along "virtual" GOALS or GOALS_ALT token
713
- ## (see INLINE_GOALS for the starting goal line inline)
714
- offsets = [m.begin(0), m.end(0)]
715
- pos = offsets[1] ## update pos
716
- end
717
- end
718
-
719
-
720
-
721
- old_pos = -1 ## allows to backtrack to old pos (used in geo)
722
-
723
- while m = @re.match( line, pos )
724
- # if debug?
725
- # pp m
726
- # puts "pos: #{pos}"
727
- # end
728
- offsets = [m.begin(0), m.end(0)]
729
-
730
- if offsets[0] != pos
731
- ## match NOT starting at start/begin position!!!
732
- ## report parse error!!!
733
- msg = "!! WARN - parse error (tokenize) - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
734
- puts msg
735
-
736
- errors << "parse error (tokenize) - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
737
- log( msg )
738
- end
739
-
740
-
741
- ##
742
- ## todo/fix - also check if possible
743
- ## if no match but not yet end off string!!!!
744
- ## report skipped text run too!!!
745
-
746
- old_pos = pos
747
- pos = offsets[1]
748
-
749
- # pp offsets if debug?
750
-
751
- ##
752
- ## note: racc requires pairs e.g. [:TOKEN, VAL]
753
- ## for VAL use "text" or ["text", { opts }] array
754
-
755
-
756
- t = if @re == ROUND_DEF_RE
757
- if m[:spaces] || m[:space]
758
- nil ## skip spaces
759
- elsif m[:date]
760
- [:DATE, [m[:date], _build_date( m )]]
761
- elsif m[:duration]
762
- [:DURATION, [m[:duration], _build_duration( m )]]
763
- elsif m[:sym]
764
- sym = m[:sym]
765
- case sym
766
- when '|' then [:'|']
767
- when ':' then [:':']
768
- when ',' then [:',']
769
- else
770
- puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
771
- nil ## ignore others (e.g. brackets [])
772
- end
773
- elsif m[:any]
774
- ## todo/check log error
775
- msg = "parse error (tokenize round_def) - skipping any match>#{m[:any]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
776
- puts "!! WARN - #{msg}"
777
-
778
- errors << msg
779
- log( "!! WARN - #{msg}" )
780
-
781
- nil
782
- else
783
- ## report error/raise expection
784
- puts "!!! TOKENIZE ERROR - no match found"
785
- nil
786
- end
787
- elsif @re == GROUP_DEF_RE
788
- if m[:spaces] || m[:space]
789
- nil ## skip spaces
790
- elsif m[:text]
791
- [:TEAM, m[:text]]
792
- elsif m[:sym]
793
- sym = m[:sym]
794
- case sym
795
- when '|' then [:'|']
796
- when ':' then [:':']
797
- when ',' then [:',']
798
- else
799
- puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
800
- nil ## ignore others (e.g. brackets [])
801
- end
802
- elsif m[:any]
803
- ## todo/check log error
804
- msg = "parse error (tokenize group_def) - skipping any match>#{m[:any]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
805
- puts "!! WARN - #{msg}"
806
-
807
- errors << msg
808
- log( "!! WARN - #{msg}" )
809
-
810
- nil
811
- else
812
- ## report error/raise expection
813
- puts "!!! TOKENIZE ERROR - no match found"
814
- nil
815
- end
816
- elsif @re == GEO_RE
817
- ### note - possibly end inline geo on [ (and others?? in the future
818
- ## note: break on double spaces e.g.
819
- ## e.g. Jul/16 @ Arena Auf Schalke, Gelsenkirchen Serbia 0-1 England
820
- if m[:spaces]
821
- ### note - do NOT break out
822
- ## if not text seen yet!!!
823
- if geo_count > 0
824
- ## get out-off geo mode and backtrack (w/ next)
825
- puts " LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
826
- @re = RE
827
- pos = old_pos
828
- next ## backtrack (resume new loop step)
829
- else
830
- nil ## skip spaces
831
- end
832
- elsif m[:space]
833
- nil ## skip (single) space
834
- elsif m[:text]
835
- geo_count += 1
836
- [:GEO, m[:text]] ## keep pos - why? why not?
837
- elsif m[:geo_end] ## "hacky" special comma; always ends geo mode!!!
838
- ## get out-off geo mode and backtrack (w/ next)
839
- puts " LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
840
- @re = RE
841
- pos = old_pos
842
- next ## backtrack (resume new loop step)
843
- elsif m[:sym]
844
- sym = m[:sym]
845
- ## return symbols "inline" as is - why? why not?
846
- ## (?<sym>[;,@|\[\]-])
847
- case sym
848
- ## note - reset geo_count to 0 (avoids break on two spaces)
849
- ## if separator seen!!
850
- when ',' then geo_count = 0; [:',']
851
- when '›' then geo_count = 0; [:','] ## note - treat geo sep › (unicode) like comma for now!!!
852
- when '>' then geo_count = 0; [:','] ## note - treat geo sep > (ascii) like comma for now!!!
853
- when '[' then
854
- ## get out-off geo mode and backtrack (w/ next)
855
- puts " LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
856
- @re = RE
857
- pos = old_pos
858
- next ## backtrack (resume new loop step)
859
- else
860
- puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
861
- nil ## ignore others (e.g. brackets [])
862
- end
863
- elsif m[:any]
864
- ## todo/check log error
865
- msg = "parse error (tokenize geo) - skipping any match>#{m[:any]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
866
- puts "!! WARN - #{msg}"
867
-
868
- errors << msg
869
- log( "!! WARN - #{msg}" )
870
-
871
- nil
872
- else
873
- ## report error/raise expection
874
- puts "!!! TOKENIZE ERROR - no match found"
875
- nil
876
- end
877
- elsif @re == PROP_CARDS_RE
878
- if m[:space] || m[:spaces]
879
- nil ## skip space(s)
880
- elsif m[:prop_name]
881
- [:PROP_NAME, m[:name]]
882
- elsif m[:minute]
883
- minute = {}
884
- minute[:m] = m[:value].to_i(10)
885
- minute[:offset] = m[:value2].to_i(10) if m[:value2]
886
- ## note - for debugging keep (pass along) "literal" minute
887
- [:MINUTE, [m[:minute], minute]]
888
- elsif m[:sym]
889
- sym = m[:sym]
890
- case sym
891
- when ',' then [:',']
892
- when ';' then [:';']
893
- when '-' then [:'-']
894
- else
895
- nil ## ignore others (e.g. brackets [])
896
- end
897
- else
898
- ## report error
899
- puts "!!! TOKENIZE ERROR (PROP_CARDS_RE) - no match found"
900
- nil
901
- end
902
- elsif @re == PROP_RE ### todo/fix - change to LINEUP_RE !!!!
903
- if m[:space] || m[:spaces]
904
- nil ## skip space(s)
905
- elsif m[:prop_key] ## check for inline prop keys
906
- key = m[:key]
907
- ## supported for now coach/trainer (add manager?)
908
- if ['coach',
909
- 'trainer'].include?( key.downcase )
910
- [:COACH, m[:key]] ## use COACH_KEY or such - why? why not?
911
- else
912
- ## report error - for unknown (inline) prop key in lineup
913
- nil
914
- end
915
- elsif m[:inline_captain]
916
- [:INLINE_CAPTAIN, m[:inline_captain]]
917
- elsif m[:inline_yellow]
918
- card = {}
919
- card[:m] = m[:minute].to_i(10) if m[:minute]
920
- card[:offset] = m[:offset].to_i(10) if m[:offset]
921
- [:INLINE_YELLOW, [m[:inline_yellow], card]]
922
- elsif m[:inline_red]
923
- card = {}
924
- card[:m] = m[:minute].to_i(10) if m[:minute]
925
- card[:offset] = m[:offset].to_i(10) if m[:offset]
926
- [:INLINE_RED, [m[:inline_red], card]]
927
- elsif m[:inline_yellow_red]
928
- card = {}
929
- card[:m] = m[:minute].to_i(10) if m[:minute]
930
- card[:offset] = m[:offset].to_i(10) if m[:offset]
931
- [:INLINE_YELLOW_RED, [m[:inline_yellow_red], card]]
932
- elsif m[:prop_name]
933
- [:PROP_NAME, m[:name]]
934
- elsif m[:minute]
935
- minute = {}
936
- minute[:m] = m[:value].to_i(10)
937
- minute[:offset] = m[:value2].to_i(10) if m[:value2]
938
- [:MINUTE, [m[:minute], minute]]
939
- elsif m[:sym]
940
- sym = m[:sym]
941
- ## return symbols "inline" as is - why? why not?
942
- ## (?<sym>[;,@|\[\]-])
943
-
944
- case sym
945
- when ',' then [:',']
946
- when ';' then [:';']
947
- when '[' then [:'[']
948
- when ']' then [:']']
949
- when '(' then [:'(']
950
- when ')' then [:')']
951
- when '-' then [:'-']
952
- else
953
- nil ## ignore others (e.g. brackets [])
954
- end
955
- else
956
- ## report error
957
- puts "!!! TOKENIZE ERROR (PROP_RE) - no match found"
958
- nil
959
- end
960
- elsif @re == PROP_ATTENDANCE_RE
961
- if m[:space] || m[:spaces]
962
- nil ## skip space(s)
963
- elsif m[:enclosed_name]
964
- ## reserverd for use for sold out or such (in the future) - why? why not?
965
- [:ENCLOSED_NAME, m[:name]]
966
- elsif m[:num]
967
- [:PROP_NUM, [m[:num], { value: m[:value].to_i(10) } ]]
968
- =begin
969
- elsif m[:sym]
970
- sym = m[:sym]
971
- case sym
972
- when ',' then [:',']
973
- when ';' then [:';']
974
- # when '[' then [:'[']
975
- # when ']' then [:']']
976
- else
977
- nil ## ignore others (e.g. brackets [])
978
- end
979
- =end
980
- else
981
- ## report error
982
- puts "!!! TOKENIZE ERROR (PROP_ATTENDANCE_RE) - no match found"
983
- nil
984
- end
985
- elsif @re == PROP_REFEREE_RE
986
- if m[:space] || m[:spaces]
987
- nil ## skip space(s)
988
- elsif m[:prop_key] ## check for inline prop keys
989
- key = m[:key]
990
- ## supported for now coach/trainer (add manager?)
991
- if ['att', 'attn', 'attendance' ].include?( key.downcase )
992
- [:ATTENDANCE, m[:key]] ## use COACH_KEY or such - why? why not?
993
- else
994
- ## report error - for unknown (inline) prop key in lineup
995
- nil
996
- end
997
- elsif m[:prop_name] ## note - change prop_name to player
998
- [:PROP_NAME, m[:name]] ### use PLAYER for token - why? why not?
999
- elsif m[:num]
1000
- [:PROP_NUM, [m[:num], { value: m[:value].to_i(10) } ]]
1001
- elsif m[:enclosed_name]
1002
- ## use HOLD,SAVE,POST or such keys - why? why not?
1003
- [:ENCLOSED_NAME, m[:name]]
1004
- elsif m[:sym]
1005
- sym = m[:sym]
1006
- case sym
1007
- when ',' then [:',']
1008
- when ';' then [:';']
1009
- # when '[' then [:'[']
1010
- # when ']' then [:']']
1011
- else
1012
- nil ## ignore others (e.g. brackets [])
1013
- end
1014
- else
1015
- ## report error
1016
- puts "!!! TOKENIZE ERROR (PROP_REFEREE_RE) - no match found"
1017
- nil
1018
- end
1019
- elsif @re == PROP_PENALTIES_RE
1020
- if m[:space] || m[:spaces]
1021
- nil ## skip space(s)
1022
- elsif m[:prop_name] ## note - change prop_name to player
1023
- [:PROP_NAME, m[:name]] ### use PLAYER for token - why? why not?
1024
- elsif m[:enclosed_name]
1025
- ## use HOLD,SAVE,POST or such keys - why? why not?
1026
- [:ENCLOSED_NAME, m[:name]]
1027
- elsif m[:score]
1028
- score = {}
1029
- ## must always have ft for now e.g. 1-1 or such
1030
- ### change to (generic) score from ft -
1031
- ## might be score a.e.t. or such - why? why not?
1032
- score[:score] = [m[:score1].to_i(10),
1033
- m[:score2].to_i(10)]
1034
- [:SCORE, [m[:score], score]]
1035
- elsif m[:sym]
1036
- sym = m[:sym]
1037
- case sym
1038
- when ',' then [:',']
1039
- when ';' then [:';']
1040
- when '[' then [:'[']
1041
- when ']' then [:']']
1042
- else
1043
- nil ## ignore others (e.g. brackets [])
1044
- end
1045
- else
1046
- ## report error
1047
- puts "!!! TOKENIZE ERROR (PROP_PENALTIES_RE) - no match found"
1048
- nil
1049
- end
1050
- elsif @re == GOAL_COMPAT_RE
1051
- if m[:space] || m[:spaces]
1052
- nil ## skip space(s)
1053
- elsif m[:prop_name] ## note - change prop_name to player
1054
- [:PLAYER, m[:name]]
1055
- elsif m[:minute]
1056
- minute = _build_minute( m )
1057
- [:MINUTE, [m[:minute], minute]]
1058
- elsif m[:goal_type]
1059
- goal_type = _build_goal_type( m )
1060
- [:GOAL_TYPE, [m[:goal_type], goal_type]]
1061
- elsif m[:score]
1062
- score = {}
1063
- ## note - score is "generic"
1064
- ## might be full-time (ft) or
1065
- ## after extra-time (aet) or such
1066
- ## or even undecided/unknown
1067
- ## thus, use score1/score2 and NOT ft1/ft2
1068
- score[:score] = [m[:score1].to_i(10),
1069
- m[:score2].to_i(10)]
1070
- ## note - for debugging keep (pass along) "literal" score
1071
- [:SCORE, [m[:score], score]]
1072
- elsif m[:sym]
1073
- sym = m[:sym]
1074
- ## return symbols "inline" as is - why? why not?
1075
- ## (?<sym>[;,@|\[\]-])
1076
-
1077
- case sym
1078
- when ',' then [:',']
1079
- when ')' ## leave goal mode!!
1080
- puts " LEAVE GOAL_COMPAT_RE MODE" if debug?
1081
- @re = RE
1082
- ## note - use/return GOAL_END token - change to GOAL_END_PAREN(THESIS)
1083
- ## or GOAL_PAREN_CLOSE/END ???
1084
- [:GOALS_END, '<|GOALS_END|>']
1085
- else
1086
- nil ## ignore others (e.g. brackets [])
1087
- end
1088
- else
1089
- ## report error
1090
- puts "!!! TOKENIZE ERROR (GOAL_COMPAT_RE) - no match found"
1091
- nil
1092
- end
1093
- elsif @re == GOAL_ALT_RE
1094
- if m[:space] || m[:spaces]
1095
- nil ## skip space(s)
1096
- elsif m[:prop_name] ## note - change prop_name to player
1097
- [:PLAYER, m[:name]]
1098
- elsif m[:goal_minute]
1099
- minute = _build_goal_minute( m )
1100
- [:GOAL_MINUTE, [m[:goal_minute], minute]]
1101
- elsif m[:goal_type]
1102
- goal_type = _build_goal_type( m )
1103
- [:GOAL_TYPE, [m[:goal_type], goal_type]]
1104
- elsif m[:score]
1105
- score = {}
1106
- ## note - score is "generic"
1107
- ## might be full-time (ft) or
1108
- ## after extra-time (aet) or such
1109
- ## or even undecided/unknown
1110
- ## thus, use score1/score2 and NOT ft1/ft2
1111
- score[:score] = [m[:score1].to_i(10),
1112
- m[:score2].to_i(10)]
1113
- ## note - for debugging keep (pass along) "literal" score
1114
- [:SCORE, [m[:score], score]]
1115
- elsif m[:sym]
1116
- sym = m[:sym]
1117
- ## return symbols "inline" as is - why? why not?
1118
- ## (?<sym>[;,@|\[\]-])
1119
-
1120
- case sym
1121
- when ',' then [:',']
1122
- when ')' ## leave goal mode!!
1123
- puts " LEAVE GOAL_ALT_RE MODE" if debug?
1124
- @re = RE
1125
- ## note - use/return GOAL_END token - change to GOAL_END_PAREN(THESIS)
1126
- ## or GOAL_PAREN_CLOSE/END ???
1127
- [:GOALS_END, '<|GOALS_END|>']
1128
- else
1129
- nil ## ignore others (e.g. brackets [])
1130
- end
1131
- else
1132
- ## report error
1133
- puts "!!! TOKENIZE ERROR (GOAL_ALT_RE) - no match found"
1134
- nil
1135
- end
1136
- elsif @re == GOAL_RE
1137
- if m[:space] || m[:spaces]
1138
- nil ## skip space(s)
1139
- elsif m[:goals_none] ## note - eats-up semicolon!! e.g. -; or - ;
1140
- [:GOALS_NONE, "<|GOALS_NONE|>"]
1141
- elsif m[:goal_sep_alt]
1142
- [:GOAL_SEP_ALT, "<|GOAL_SEP_ALT|>" ] ## e.g. dash (-) WITH leading & trailing space required
1143
- elsif m[:prop_name] ## note - change prop_name to player
1144
- [:PLAYER, m[:name]]
1145
- elsif m[:goal_minute]
1146
- minute = _build_goal_minute( m )
1147
- [:GOAL_MINUTE, [m[:goal_minute], minute]]
1148
- elsif m[:goal_count]
1149
- count = _build_goal_count( m )
1150
- [:GOAL_COUNT, [m[:goal_count], count]]
1151
- elsif m[:sym]
1152
- sym = m[:sym]
1153
- ## return symbols "inline" as is - why? why not?
1154
- ## (?<sym>[;,@|\[\]-])
1155
-
1156
- case sym
1157
- when ',' then [:',']
1158
- when ';' then [:';']
1159
- # when '[' then [:'[']
1160
- # when ']' then [:']']
1161
- when ')' ## leave goal mode!!
1162
- puts " LEAVE GOAL_RE MODE" if debug?
1163
- @re = RE
1164
- ## note - use/return GOAL_END token - change to GOAL_END_PAREN(THESIS)
1165
- ## or GOAL_PAREN_CLOSE/END ???
1166
- [:GOALS_END, '<|GOALS_END|>']
1167
- else
1168
- nil ## ignore others (e.g. brackets [])
1169
- end
1170
- else
1171
- ## report error
1172
- puts "!!! TOKENIZE ERROR (GOAL_RE) - no match found"
1173
- nil
1174
- end
1175
- ###################################################
1176
- ## assume TOP_LEVEL (a.k.a. RE) machinery
1177
- else
1178
- if m[:space] || m[:spaces]
1179
- nil ## skip space(s)
1180
- elsif m[:text]
1181
- ## note - top-level (for now always) assumes TEAM for TEXT match!!
1182
- [:TEAM, m[:text]] ## keep pos - why? why not?
1183
- elsif m[:status] ## (match) status e.g. cancelled, awarded, etc.
1184
- [:STATUS, [m[:status], _build_status( m ) ]]
1185
- elsif m[:inline_wo] ## w/o - walkover (match status)
1186
- [:INLINE_WO, m[:inline_wo]]
1187
- elsif m[:inline_np] ## n/p - not played (match status)
1188
- [:INLINE_NP, m[:inline_np]]
1189
- elsif m[:inline_bye] ## bye (match status)
1190
- [:INLINE_BYE, m[:inline_bye]]
1191
- elsif m[:inline_abd] ## abd/abd. - abandoned (match status)
1192
- [:INLINE_ABD, m[:inline_abd]]
1193
- elsif m[:inline_void] ## abd/abd. - abandoned (match status)
1194
- [:INLINE_VOID, m[:inline_void]]
1195
- elsif m[:inline_susp] ## susp/susp. - suspended (match status)
1196
- [:INLINE_SUSP, m[:inline_susp]]
1197
- elsif m[:inline_ppd] ## ppd/ppd. or postp/postp. - postponed (match status)
1198
- [:INLINE_PPD, m[:inline_ppd]]
1199
- elsif m[:inline_awd] ## awd/awd. - awarded (match status)
1200
- [:INLINE_AWD, m[:inline_awd]]
1201
- elsif m[:inline_canc] ## canc/canc. - cancelled/canceled (match status)
1202
- [:INLINE_CANC, m[:inline_canc]]
1203
-
1204
- elsif m[:team_home]
1205
- [:TEAM_HOME, m[:team_home]]
1206
- elsif m[:team_away]
1207
- [:TEAM_AWAY, m[:team_away]]
1208
- elsif m[:team_neutral]
1209
- [:TEAM_NEUTRAL, m[:team_neutral]]
1210
-
1211
- elsif m[:attendance]
1212
- att = {}
1213
- att[:value] = m[:value].gsub( '_', '' ).to_i(10)
1214
- ## note - for token id use INLINE_ATTENDANCE (ATTENDANCE in use for prop!!!)
1215
- [:INLINE_ATTENDANCE, [m[:attendance], att ]]
1216
- elsif m[:note]
1217
- ### todo/check:
1218
- ## use value hash - why? why not? or simplify to:
1219
- ## [:NOTE, [m[:note], {note: m[:note] } ]]
1220
- [:NOTE, m[:note]]
1221
- elsif m[:time]
1222
- [:TIME, [m[:time], _build_time(m)]]
1223
- elsif m[:date]
1224
- [:DATE, [m[:date], _build_date(m)]]
1225
- elsif m[:date_legs]
1226
- [:DATE_LEGS, [m[:date_legs], _build_date_legs(m)]]
1227
- elsif m[:score_team]
1228
- [:SCORE_TEAM, [m[:score_team], _build_score_team(m)]]
1229
- elsif m[:score_team_pen]
1230
- [:SCORE_TEAM_PEN, [m[:score_team_pen], _build_score_team_pen(m)]]
1231
- elsif m[:score_team_num]
1232
- [:SCORE_TEAM_NUM, [m[:score_team_num], _build_score_team_num(m)]]
1233
- elsif m[:score_legs]
1234
- legs = {}
1235
-
1236
- ### leg1
1237
- score = {}
1238
- score[:ft] = [m[:leg1_ft1].to_i(10),
1239
- m[:leg1_ft2].to_i(10)]
1240
- legs['leg1'] = score
1241
-
1242
- ### leg2
1243
- score = {}
1244
- score[:ft] = [m[:leg2_ft1].to_i(10),
1245
- m[:leg2_ft2].to_i(10)] if m[:leg2_ft1] && m[:leg2_ft2]
1246
- score[:et] = [m[:leg2_et1].to_i(10),
1247
- m[:leg2_et2].to_i(10)] if m[:leg2_et1] && m[:leg2_et2]
1248
- score[:p] = [m[:leg2_p1].to_i(10),
1249
- m[:leg2_p2].to_i(10)] if m[:leg2_p1] && m[:leg2_p2]
1250
- legs['leg2'] = score
1251
-
1252
- ## check for (opt) aggregate - keep on "top-level"
1253
- legs[:agg] = [m[:agg1].to_i(10),
1254
- m[:agg2].to_i(10)] if m[:agg1] && m[:agg2]
1255
- legs[:away] = true if m[:away]
1256
-
1257
- ## note - for debugging keep (pass along) "literal" score
1258
- [:SCORE_LEGS, [m[:score_legs], legs]]
1259
- elsif m[:score_full]
1260
- score = {}
1261
- score[:p] = [m[:p1].to_i(10),
1262
- m[:p2].to_i(10)] if m[:p1] && m[:p2]
1263
- score[:et] = [m[:et1].to_i(10),
1264
- m[:et2].to_i(10)] if m[:et1] && m[:et2]
1265
- score[:ft] = [m[:ft1].to_i(10),
1266
- m[:ft2].to_i(10)] if m[:ft1] && m[:ft2]
1267
- score[:ht] = [m[:ht1].to_i(10),
1268
- m[:ht2].to_i(10)] if m[:ht1] && m[:ht2]
1269
- score[:agg] = [m[:agg1].to_i(10),
1270
- m[:agg2].to_i(10)] if m[:agg1] && m[:agg2]
1271
-
1272
- if m[:away1] && m[:away2]
1273
- score[:away] = [m[:away1].to_i(10),
1274
- m[:away2].to_i(10)]
1275
- elsif m[:away] ## fallback if no away score; check away flag
1276
- score[:away] = true
1277
- end
1278
-
1279
- ## add golden/silver flags
1280
- score[:golden] = true if m[:aetgg] ## golden goal (gg)/sudden death (sd)
1281
- score[:silver] = true if m[:aetsg] ## silver goal (sg)
1282
-
1283
- ## note - for debugging keep (pass along) "literal" score
1284
- [:SCORE_FULL, [m[:score_full], score]]
1285
- elsif m[:score_fuller]
1286
- score = {}
1287
- score[:p] = [m[:p1].to_i(10),
1288
- m[:p2].to_i(10)] if m[:p1] && m[:p2]
1289
- score[:et] = [m[:et1].to_i(10),
1290
- m[:et2].to_i(10)] if m[:et1] && m[:et2]
1291
- score[:ft] = [m[:ft1].to_i(10),
1292
- m[:ft2].to_i(10)] if m[:ft1] && m[:ft2]
1293
- score[:ht] = [m[:ht1].to_i(10),
1294
- m[:ht2].to_i(10)] if m[:ht1] && m[:ht2]
1295
- score[:agg] = [m[:agg1].to_i(10),
1296
- m[:agg2].to_i(10)] if m[:agg1] && m[:agg2]
1297
- if m[:away1] && m[:away2]
1298
- score[:away] = [m[:away1].to_i(10),
1299
- m[:away2].to_i(10)]
1300
- elsif m[:away] ## fallback if no away score; check away flag
1301
- score[:away] = true
1302
- end
1303
-
1304
- ## add aet flag true/false
1305
- # score[:aet] = true if m[:aet] || m[:aetgg] || m[:aetsg]
1306
-
1307
- ## add golden/silver flags
1308
- score[:golden] = true if m[:aetgg] ## golden goal (gg)/sudden death (sd)
1309
- score[:silver] = true if m[:aetsg] ## silver goal (sg)
1310
-
1311
- ## note - for debugging keep (pass along) "literal" score
1312
- [:SCORE_FULLER, [m[:score_fuller], score]]
1313
- elsif m[:score_fuller_more]
1314
- ## SCORE + SCORE_FULLER_MORE
1315
- ## note - after extra-time (aet) or full-time (ft)
1316
- ## score may be present in SCORE!!!
1317
- score = {}
1318
- score[:p] = [m[:p1].to_i(10),
1319
- m[:p2].to_i(10)] if m[:p1] && m[:p2]
1320
- score[:et] = [m[:et1].to_i(10),
1321
- m[:et2].to_i(10)] if m[:et1] && m[:et2]
1322
- score[:ft] = [m[:ft1].to_i(10),
1323
- m[:ft2].to_i(10)] if m[:ft1] && m[:ft2]
1324
- score[:ht] = [m[:ht1].to_i(10),
1325
- m[:ht2].to_i(10)] if m[:ht1] && m[:ht2]
1326
- score[:agg] = [m[:agg1].to_i(10),
1327
- m[:agg2].to_i(10)] if m[:agg1] && m[:agg2]
1328
- if m[:away1] && m[:away2]
1329
- score[:away] = [m[:away1].to_i(10),
1330
- m[:away2].to_i(10)]
1331
- elsif m[:away] ## fallback if no away score; check away flag
1332
- score[:away] = true
1333
- end
1334
-
1335
- ## add flag in score for et/ft/ht
1336
- score[:score] = 'et' if m[:aet] || m[:aetgg] || m[:aetsg]
1337
- score[:score] = 'ft' if m[:ft]
1338
- score[:score] = 'ht' if m[:ht]
1339
-
1340
- ## add golden/silver flags
1341
- score[:golden] = true if m[:aetgg] ## golden goal (gg)/sudden death (sd)
1342
- score[:silver] = true if m[:aetsg] ## silver goal (sg)
1343
-
1344
- ## note - for debugging keep (pass along) "literal" score
1345
- [:SCORE_FULLER_MORE, [m[:score_fuller_more], score]]
1346
- elsif m[:score]
1347
- score = {}
1348
- ## note - score is "generic"
1349
- ## might be full-time (ft) or
1350
- ## after extra-time (aet) or such
1351
- ## or even undecided/unknown
1352
- ## thus, use score1/score2 and NOT ft1/ft2
1353
- score[:score] = [m[:score1].to_i(10),
1354
- m[:score2].to_i(10)]
1355
- ## note - for debugging keep (pass along) "literal" score
1356
- [:SCORE, [m[:score], score]]
1357
- elsif m[:score_awd] ## score awarded (awd/awd.)
1358
- score = {}
1359
- ### note - use "generic" score for now
1360
- ## to match A 3-0 B [awarded] etc.
1361
- score[:score] = [m[:score1].to_i(10),
1362
- m[:score2].to_i(10)]
1363
- ## add score[:awarded] = true ???
1364
- ## or only use match status to avoid duplicate?
1365
- [:SCORE_AWD, [m[:score_awd], score]]
1366
- elsif m[:score_abd] ## score abandonded (abd/abd.)
1367
- score = {}
1368
- ### note - use "generic" score for now
1369
- score[:score] = [m[:score1].to_i(10),
1370
- m[:score2].to_i(10)]
1371
- ## add score[:awarded] = true ???
1372
- ## or only use match status to avoid duplicate?
1373
- [:SCORE_ABD, [m[:score_abd], score]]
1374
- elsif m[:minute]
1375
- minute = {}
1376
- minute[:m] = m[:value].to_i(10)
1377
- minute[:offset] = m[:value2].to_i(10) if m[:value2]
1378
- ## note - for debugging keep (pass along) "literal" minute
1379
- [:MINUTE, [m[:minute], minute]]
1380
- elsif m[:vs]
1381
- [:VS, m[:vs]]
1382
- elsif m[:sym]
1383
- sym = m[:sym]
1384
- ## return symbols "inline" as is - why? why not?
1385
- ## (?<sym>[;,@|\[\]-])
1386
-
1387
- case sym
1388
- when '@' ## enter geo mode
1389
- puts " ENTER GEO_RE MODE" if debug?
1390
- @re = GEO_RE
1391
- geo_count = 0
1392
- [:'@']
1393
- when ',' then [:',']
1394
- when ';' then [:';']
1395
- when '/' then [:'/']
1396
- when '|' then [:'|']
1397
- when '[' then [:'[']
1398
- when ']' then [:']']
1399
- when '-' then [:'-']
1400
- when '(' ## enter goal scorer mode on "free-floating" open paranthesis!!!
1401
- puts " ENTER GOAL_RE MODE" if debug?
1402
- @re = GOAL_RE
1403
- ## note - eat-up ( for now; do NOT pass along as token
1404
- ## pass along "virutal" INLINE GOALS - why? why not?
1405
- [:INLINE_GOALS, "<|INLINE_GOALS|>"]
1406
- when ')' then [:')']
1407
- else
1408
- puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
1409
- nil ## ignore others (e.g. brackets [])
1410
- end
1411
- elsif m[:any]
1412
- ## todo/check log error
1413
- msg = "parse error (tokenize) - skipping any match>#{m[:any]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
1414
- puts "!! WARN - #{msg}"
1415
-
1416
- errors << msg
1417
- log( "!! WARN - #{msg}" )
1418
-
1419
- nil
1420
- else
1421
- ## report error
1422
- puts "!!! TOKENIZE ERROR - no match found"
1423
- nil
1424
- end
1425
- end
1426
-
1427
-
1428
- tokens << t if t
1429
-
1430
- # if debug?
1431
- # print ">"
1432
- # print "*" * pos
1433
- # puts "#{line[pos..-1]}<"
1434
- # end
1435
- end
1436
-
1437
- ## check if no match in end of string
1438
- if offsets[1] != line.size
1439
- msg = "!! WARN - parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
1440
- puts msg
1441
- log( msg )
1442
-
1443
- errors << "parse error (tokenize) - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
1444
- end
1445
-
1446
-
1447
- # if @re == GOAL_RE ### ALWAYS switch back to top level mode
1448
- # puts " LEAVE GOAL_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
1449
- # @re = RE
1450
- # end
1451
-
1452
- if @re == GEO_RE ### ALWAYS switch back to top level mode
1453
- puts " LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
1454
- @re = RE
1455
- end
1456
-
1457
- @re = RE if @re == GROUP_DEF_RE ### ALWAYS switch back to top level mode
1458
- @re = RE if @re == ROUND_DEF_RE
1459
-
1460
- ##
1461
- ## if in prop mode continue if last token is [,-]
1462
- ## otherwise change back to "standard" mode
1463
- if @re == PROP_RE || @re == PROP_CARDS_RE ||
1464
- @re == PROP_PENALTIES_RE ||
1465
- @re == PROP_ATTENDANCE_RE || @re == PROP_REFEREE_RE
1466
- if [:',', :'-', :';'].include?( tokens[-1][0] )
1467
- ## continue/stay in PROP_RE mode
1468
- ## todo/check - auto-add PROP_CONT token or such
1469
- ## to help parser with possible NEWLINE
1470
- ## conflicts - why? why not?
1471
- else
1472
- ## switch back to top-level mode!!
1473
- puts " LEAVE PROP_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
1474
- @re = RE
1475
- ## note - auto-add PROP_END (<PROP_END>)
1476
- tokens << [:PROP_END, "<|PROP_END|>"]
1477
- end
1478
- end
1479
252
 
1480
-
1481
- [tokens,errors]
1482
- end
1483
253
 
1484
254
  end # class Lexer
1485
255
  end # module SportDb