sportdb-parser 0.6.3 → 0.6.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/lib/sportdb/parser/lexer.rb +196 -26
- data/lib/sportdb/parser/parser.rb +678 -449
- data/lib/sportdb/parser/racc_tree.rb +62 -3
- data/lib/sportdb/parser/token-date.rb +20 -0
- data/lib/sportdb/parser/token-minute.rb +140 -0
- data/lib/sportdb/parser/token-prop.rb +57 -9
- data/lib/sportdb/parser/token.rb +47 -10
- data/lib/sportdb/parser/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7cb697dbecb7802a9701c60527bb67125dc7ac550b6c5c80f67044e51683eb15
|
4
|
+
data.tar.gz: d53cc9c126b93a4702b2c633daf863a3e4d4b63397f7c5ac60bd0646ab1d2da9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 341aa9f1b25259db42452f28bd69f5643ecb5a3b45755a88565aac2b7a2842bc34d8d2a53e7d3b7a4b96c15c199c000167f4e94e608486a4ae9fefa9756bc7a0
|
7
|
+
data.tar.gz: d983116b79d9f361e136ef989ac02041ea735cfda42614f0ff08f5fd43899d7deebc135a82c020ff6dc7c9c6cf6ee413bbff7cdc9f65e41a682b3bf5cb5b082f
|
data/CHANGELOG.md
CHANGED
data/lib/sportdb/parser/lexer.rb
CHANGED
@@ -246,9 +246,10 @@ def tokenize_with_errors
|
|
246
246
|
nodes << [:TEAM, buf.next[1]]
|
247
247
|
nodes << buf.next
|
248
248
|
nodes << [:TEAM, buf.next[1]]
|
249
|
-
|
250
|
-
|
251
|
-
|
249
|
+
# note - now handled (upstream) with GOAL_RE mode!!!
|
250
|
+
# elsif buf.match?( :TEXT, :MINUTE )
|
251
|
+
# nodes << [:PLAYER, buf.next[1]]
|
252
|
+
# nodes << buf.next
|
252
253
|
elsif buf.match?( :DATE, :TIME ) ## merge DATE TIME into DATETIME
|
253
254
|
date = buf.next[1]
|
254
255
|
time = buf.next[1]
|
@@ -315,11 +316,103 @@ def _tokenize_line( line )
|
|
315
316
|
@re ||= RE ## note - switch between RE & INSIDE_RE
|
316
317
|
|
317
318
|
|
319
|
+
if @re == RE ## top-level
|
320
|
+
### check for modes once (per line) here to speed-up parsing
|
321
|
+
### for now goals only possible for start of line!!
|
322
|
+
### fix - remove optional [] - why? why not?
|
323
|
+
|
324
|
+
## start with prop key (match will switch into prop mode!!!)
|
325
|
+
## - fix - remove leading spaces in regex (upstream) - why? why not?
|
326
|
+
m = PROP_KEY_RE.match( line )
|
327
|
+
if m
|
328
|
+
### switch into new mode
|
329
|
+
## switch context to PROP_RE
|
330
|
+
puts " ENTER PROP_RE MODE" if debug?
|
331
|
+
key = m[:key]
|
332
|
+
|
333
|
+
|
334
|
+
### todo - add prop yellow/red cards too - why? why not?
|
335
|
+
if ['sent off', 'red cards'].include?( key.downcase)
|
336
|
+
@re = PROP_CARDS_RE ## use CARDS_RE ???
|
337
|
+
tokens << [:PROP_REDCARDS, m[:key]]
|
338
|
+
elsif ['yellow cards'].include?( key.downcase )
|
339
|
+
@re = PROP_CARDS_RE
|
340
|
+
tokens << [:PROP_YELLOWCARDS, m[:key]]
|
341
|
+
elsif ['ref', 'referee'].include?( key.downcase )
|
342
|
+
@re = PROP_RE ## (re)use prop setup for now - why? why not?
|
343
|
+
tokens << [:PROP_REFEREE, m[:key]]
|
344
|
+
elsif ['goals'].include?( key.downcase )
|
345
|
+
@re = PROP_GOAL_RE
|
346
|
+
tokens << [:PROP_GOALS, m[:key]]
|
347
|
+
else ## assume (team) line-up
|
348
|
+
@re = PROP_RE ## use LINEUP_RE ???
|
349
|
+
tokens << [:PROP, m[:key]]
|
350
|
+
end
|
351
|
+
|
352
|
+
offsets = [m.begin(0), m.end(0)]
|
353
|
+
pos = offsets[1] ## update pos
|
354
|
+
end
|
355
|
+
|
356
|
+
m = PLAYER_WITH_SCORE_RE.match( line )
|
357
|
+
if m
|
358
|
+
## switch context to GOAL_RE (goalline(s)
|
359
|
+
## split token (automagically) into two!! - player AND minute!!!
|
360
|
+
@re = GOAL_RE
|
361
|
+
puts " ENTER GOAL_RE MODE" if debug?
|
362
|
+
|
363
|
+
score = {}
|
364
|
+
## must always have ft for now e.g. 1-1 or such
|
365
|
+
### change to (generic) score from ft -
|
366
|
+
## might be score a.e.t. or such - why? why not?
|
367
|
+
score[:ft] = [m[:ft1].to_i(10),
|
368
|
+
m[:ft2].to_i(10)]
|
369
|
+
## note - for debugging keep (pass along) "literal" score
|
370
|
+
tokens << [:SCORE, [m[:score], score]]
|
371
|
+
|
372
|
+
## auto-add player token
|
373
|
+
tokens << [:PLAYER, m[:name]]
|
374
|
+
|
375
|
+
offsets = [m.begin(0), m.end(0)]
|
376
|
+
pos = offsets[1] ## update pos
|
377
|
+
end
|
378
|
+
|
379
|
+
m = PLAYER_WITH_MINUTE_RE.match( line )
|
380
|
+
if m
|
381
|
+
## switch context to GOAL_RE (goalline(s)
|
382
|
+
## split token (automagically) into two!! - player AND minute!!!
|
383
|
+
@re = GOAL_RE
|
384
|
+
puts " ENTER GOAL_RE MODE" if debug?
|
385
|
+
|
386
|
+
## check for optional open_bracket
|
387
|
+
tokens << [:'['] if m[:open_bracket]
|
388
|
+
|
389
|
+
## check for -; (none with separator)
|
390
|
+
## todo - find a better way? how possible?
|
391
|
+
tokens << [:NONE, "<|NONE|>"] if m[:none]
|
392
|
+
|
393
|
+
|
394
|
+
|
395
|
+
## auto-add player token first
|
396
|
+
tokens << [:PLAYER, m[:name]]
|
397
|
+
## minute props
|
398
|
+
minute = {}
|
399
|
+
minute[:m] = m[:value].to_i(10)
|
400
|
+
minute[:offset] = m[:value2].to_i(10) if m[:value2]
|
401
|
+
## t is minute only
|
402
|
+
tokens << [:MINUTE, [m[:minute], minute]]
|
403
|
+
|
404
|
+
offsets = [m.begin(0), m.end(0)]
|
405
|
+
pos = offsets[1] ## update pos
|
406
|
+
end
|
407
|
+
end
|
408
|
+
|
409
|
+
|
410
|
+
|
318
411
|
while m = @re.match( line, pos )
|
319
|
-
#
|
320
|
-
#
|
321
|
-
#
|
322
|
-
#
|
412
|
+
# if debug?
|
413
|
+
# pp m
|
414
|
+
# puts "pos: #{pos}"
|
415
|
+
# end
|
323
416
|
offsets = [m.begin(0), m.end(0)]
|
324
417
|
|
325
418
|
if offsets[0] != pos
|
@@ -345,10 +438,44 @@ def _tokenize_line( line )
|
|
345
438
|
## note: racc requires pairs e.g. [:TOKEN, VAL]
|
346
439
|
## for VAL use "text" or ["text", { opts }] array
|
347
440
|
|
348
|
-
|
349
|
-
|
441
|
+
t = if @re == PROP_CARDS_RE
|
442
|
+
if m[:space] || m[:spaces]
|
443
|
+
nil ## skip space(s)
|
444
|
+
elsif m[:prop_name]
|
445
|
+
[:PROP_NAME, m[:name]]
|
446
|
+
elsif m[:minute]
|
447
|
+
minute = {}
|
448
|
+
minute[:m] = m[:value].to_i(10)
|
449
|
+
minute[:offset] = m[:value2].to_i(10) if m[:value2]
|
450
|
+
## note - for debugging keep (pass along) "literal" minute
|
451
|
+
[:MINUTE, [m[:minute], minute]]
|
452
|
+
elsif m[:sym]
|
453
|
+
sym = m[:sym]
|
454
|
+
case sym
|
455
|
+
when ',' then [:',']
|
456
|
+
when ';' then [:';']
|
457
|
+
when '-' then [:'-']
|
458
|
+
else
|
459
|
+
nil ## ignore others (e.g. brackets [])
|
460
|
+
end
|
461
|
+
else
|
462
|
+
## report error
|
463
|
+
puts "!!! TOKENIZE ERROR (PROP_CARDS_RE) - no match found"
|
464
|
+
nil
|
465
|
+
end
|
466
|
+
elsif @re == PROP_RE ### todo/fix - change to LINEUP_RE !!!!
|
350
467
|
if m[:space] || m[:spaces]
|
351
468
|
nil ## skip space(s)
|
469
|
+
elsif m[:prop_key] ## check for inline prop keys
|
470
|
+
key = m[:key]
|
471
|
+
## supported for now coach/trainer (add manager?)
|
472
|
+
if ['coach',
|
473
|
+
'trainer'].include?( key.downcase )
|
474
|
+
[:COACH, m[:key]] ## use COACH_KEY or such - why? why not?
|
475
|
+
else
|
476
|
+
## report error - for unknown (inline) prop key in lineup
|
477
|
+
nil
|
478
|
+
end
|
352
479
|
elsif m[:prop_name]
|
353
480
|
if m[:name] == 'Y'
|
354
481
|
[:YELLOW_CARD, m[:name]]
|
@@ -376,11 +503,6 @@ def _tokenize_line( line )
|
|
376
503
|
when '(' then [:'(']
|
377
504
|
when ')' then [:')']
|
378
505
|
when '-' then [:'-']
|
379
|
-
# when '.' then
|
380
|
-
# ## switch back to top-level mode!!
|
381
|
-
# puts " LEAVE PROP_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
|
382
|
-
# @re = RE
|
383
|
-
# [:'.']
|
384
506
|
else
|
385
507
|
nil ## ignore others (e.g. brackets [])
|
386
508
|
end
|
@@ -389,16 +511,53 @@ def _tokenize_line( line )
|
|
389
511
|
puts "!!! TOKENIZE ERROR (PROP_RE) - no match found"
|
390
512
|
nil
|
391
513
|
end
|
514
|
+
elsif @re == GOAL_RE || @re == PROP_GOAL_RE
|
515
|
+
if m[:space] || m[:spaces]
|
516
|
+
nil ## skip space(s)
|
517
|
+
elsif m[:prop_name] ## note - change prop_name to player
|
518
|
+
[:PLAYER, m[:name]]
|
519
|
+
elsif m[:minute]
|
520
|
+
minute = {}
|
521
|
+
minute[:m] = m[:value].to_i(10)
|
522
|
+
minute[:offset] = m[:value2].to_i(10) if m[:value2]
|
523
|
+
## note - for debugging keep (pass along) "literal" minute
|
524
|
+
[:MINUTE, [m[:minute], minute]]
|
525
|
+
elsif m[:score]
|
526
|
+
score = {}
|
527
|
+
## must always have ft for now e.g. 1-1 or such
|
528
|
+
### change to (generic) score from ft -
|
529
|
+
## might be score a.e.t. or such - why? why not?
|
530
|
+
score[:ft] = [m[:ft1].to_i(10),
|
531
|
+
m[:ft2].to_i(10)]
|
532
|
+
## note - for debugging keep (pass along) "literal" score
|
533
|
+
[:SCORE, [m[:score], score]]
|
534
|
+
elsif m[:og]
|
535
|
+
[:OG, m[:og]] ## for typed drop - string version/variants ?? why? why not?
|
536
|
+
elsif m[:pen]
|
537
|
+
[:PEN, m[:pen]]
|
538
|
+
elsif m[:sym]
|
539
|
+
sym = m[:sym]
|
540
|
+
## return symbols "inline" as is - why? why not?
|
541
|
+
## (?<sym>[;,@|\[\]-])
|
542
|
+
|
543
|
+
case sym
|
544
|
+
when ',' then [:',']
|
545
|
+
when ';' then [:';']
|
546
|
+
when '[' then [:'[']
|
547
|
+
when ']' then [:']']
|
548
|
+
else
|
549
|
+
nil ## ignore others (e.g. brackets [])
|
550
|
+
end
|
551
|
+
else
|
552
|
+
## report error
|
553
|
+
puts "!!! TOKENIZE ERROR (GOAL_RE) - no match found"
|
554
|
+
nil
|
555
|
+
end
|
392
556
|
###################################################
|
393
557
|
## assume TOP_LEVEL (a.k.a. RE) machinery
|
394
558
|
else
|
395
559
|
if m[:space] || m[:spaces]
|
396
560
|
nil ## skip space(s)
|
397
|
-
elsif m[:prop_key]
|
398
|
-
## switch context to PROP_RE
|
399
|
-
@re = PROP_RE
|
400
|
-
puts " ENTER PROP_RE MODE" if debug?
|
401
|
-
[:PROP, m[:key]]
|
402
561
|
elsif m[:text]
|
403
562
|
[:TEXT, m[:text]] ## keep pos - why? why not?
|
404
563
|
elsif m[:status] ## (match) status e.g. cancelled, awarded, etc.
|
@@ -436,7 +595,9 @@ def _tokenize_line( line )
|
|
436
595
|
date = {}
|
437
596
|
## map month names
|
438
597
|
## note - allow any/upcase JULY/JUL etc. thus ALWAYS downcase for lookup
|
439
|
-
date[:y]
|
598
|
+
date[:y] = m[:year].to_i(10) if m[:year]
|
599
|
+
## check - use y too for two-digit year or keep separate - why? why not?
|
600
|
+
date[:yy] = m[:yy].to_i(10) if m[:yy] ## two digit year (e.g. 25 or 78 etc.)
|
440
601
|
date[:m] = m[:month].to_i(10) if m[:month]
|
441
602
|
date[:m] = MONTH_MAP[ m[:month_name].downcase ] if m[:month_name]
|
442
603
|
date[:d] = m[:day].to_i(10) if m[:day]
|
@@ -480,6 +641,8 @@ def _tokenize_line( line )
|
|
480
641
|
elsif m[:score]
|
481
642
|
score = {}
|
482
643
|
## must always have ft for now e.g. 1-1 or such
|
644
|
+
### change to (generic) score from ft -
|
645
|
+
## might be score a.e.t. or such - why? why not?
|
483
646
|
score[:ft] = [m[:ft1].to_i(10),
|
484
647
|
m[:ft2].to_i(10)]
|
485
648
|
## note - for debugging keep (pass along) "literal" score
|
@@ -490,10 +653,6 @@ def _tokenize_line( line )
|
|
490
653
|
minute[:offset] = m[:value2].to_i(10) if m[:value2]
|
491
654
|
## note - for debugging keep (pass along) "literal" minute
|
492
655
|
[:MINUTE, [m[:minute], minute]]
|
493
|
-
elsif m[:og]
|
494
|
-
[:OG, m[:og]] ## for typed drop - string version/variants ?? why? why not?
|
495
|
-
elsif m[:pen]
|
496
|
-
[:PEN, m[:pen]]
|
497
656
|
elsif m[:vs]
|
498
657
|
[:VS, m[:vs]]
|
499
658
|
elsif m[:sym]
|
@@ -514,8 +673,13 @@ def _tokenize_line( line )
|
|
514
673
|
when '---' then [:'---'] # level 3
|
515
674
|
when '----' then [:'----'] # level 4
|
516
675
|
else
|
676
|
+
puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
|
517
677
|
nil ## ignore others (e.g. brackets [])
|
518
678
|
end
|
679
|
+
elsif m[:any]
|
680
|
+
## todo/check log error
|
681
|
+
puts "!!! TOKENIZE ERROR (any) - no match found >#{m[:any]}<"
|
682
|
+
nil
|
519
683
|
else
|
520
684
|
## report error
|
521
685
|
puts "!!! TOKENIZE ERROR - no match found"
|
@@ -546,8 +710,8 @@ def _tokenize_line( line )
|
|
546
710
|
##
|
547
711
|
## if in prop mode continue if last token is [,-]
|
548
712
|
## otherwise change back to "standard" mode
|
549
|
-
if @re == PROP_RE
|
550
|
-
if [:',', :'-'].include?( tokens[-1][0] )
|
713
|
+
if @re == PROP_RE || @re == PROP_CARDS_RE || @re == PROP_GOAL_RE
|
714
|
+
if [:',', :'-', :';'].include?( tokens[-1][0] )
|
551
715
|
## continue/stay in PROP_RE mode
|
552
716
|
## todo/check - auto-add PROP_CONT token or such
|
553
717
|
## to help parser with possible NEWLINE
|
@@ -560,6 +724,12 @@ def _tokenize_line( line )
|
|
560
724
|
tokens << [:PROP_END, "<|PROP_END|>"]
|
561
725
|
end
|
562
726
|
end
|
727
|
+
|
728
|
+
|
729
|
+
if @re == GOAL_RE ### ALWAYS switch back to top level mode
|
730
|
+
puts " LEAVE GOAL_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
|
731
|
+
@re = RE
|
732
|
+
end
|
563
733
|
|
564
734
|
[tokens,errors]
|
565
735
|
end
|