sportdb-parser 0.6.3 → 0.6.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/lib/sportdb/parser/lexer.rb +140 -17
- data/lib/sportdb/parser/parser.rb +414 -354
- data/lib/sportdb/parser/racc_tree.rb +24 -0
- data/lib/sportdb/parser/token-date.rb +20 -0
- data/lib/sportdb/parser/token-minute.rb +140 -0
- data/lib/sportdb/parser/token-prop.rb +17 -9
- data/lib/sportdb/parser/token.rb +39 -10
- data/lib/sportdb/parser/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 00ec5bcacfe56b29e9589507c11d3bfe361caed4b45ebdfa3b05901c8229b019
|
4
|
+
data.tar.gz: bd36a9b6c0b84a9a033d721c4adf086ff25703ed31dde3fc8265c421ba6273c1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: db5568eb30b924f0e963402ed3089edc1b1cdbcf31cfaa4177da6215b82e0646badd428e94a14a42abfbb4688c7543f8b04f698eb1551082e9731b1937a23e19
|
7
|
+
data.tar.gz: d3619f1ea496cf4fdc08d0904afee295679d6f1aa0c35aacd09ccf8e19ad00beb78f832efef3f24189b8793b0dd24064544854e84d7d26653398962d79bda311
|
data/CHANGELOG.md
CHANGED
data/lib/sportdb/parser/lexer.rb
CHANGED
@@ -246,9 +246,10 @@ def tokenize_with_errors
|
|
246
246
|
nodes << [:TEAM, buf.next[1]]
|
247
247
|
nodes << buf.next
|
248
248
|
nodes << [:TEAM, buf.next[1]]
|
249
|
-
|
250
|
-
|
251
|
-
|
249
|
+
# note - now handled (upstream) with GOAL_RE mode!!!
|
250
|
+
# elsif buf.match?( :TEXT, :MINUTE )
|
251
|
+
# nodes << [:PLAYER, buf.next[1]]
|
252
|
+
# nodes << buf.next
|
252
253
|
elsif buf.match?( :DATE, :TIME ) ## merge DATE TIME into DATETIME
|
253
254
|
date = buf.next[1]
|
254
255
|
time = buf.next[1]
|
@@ -315,11 +316,85 @@ def _tokenize_line( line )
|
|
315
316
|
@re ||= RE ## note - switch between RE & INSIDE_RE
|
316
317
|
|
317
318
|
|
319
|
+
if @re == RE ## top-level
|
320
|
+
### check for modes once (per line) here to speed-up parsing
|
321
|
+
### for now goals only possible for start of line!!
|
322
|
+
### fix - remove optional [] - why? why not?
|
323
|
+
|
324
|
+
## start with prop key (match will switch into prop mode!!!)
|
325
|
+
## - fix - remove leading spaces in regex (upstream) - why? why not?
|
326
|
+
m = PROP_KEY_RE.match( line )
|
327
|
+
if m
|
328
|
+
### switch into new mode
|
329
|
+
## switch context to PROP_RE
|
330
|
+
@re = PROP_RE
|
331
|
+
puts " ENTER PROP_RE MODE" if debug?
|
332
|
+
tokens << [:PROP, m[:key]]
|
333
|
+
|
334
|
+
offsets = [m.begin(0), m.end(0)]
|
335
|
+
pos = offsets[1] ## update pos
|
336
|
+
end
|
337
|
+
|
338
|
+
m = PLAYER_WITH_SCORE_RE.match( line )
|
339
|
+
if m
|
340
|
+
## switch context to GOAL_RE (goalline(s))
|
341
|
+
## split token (automagically) into two!! - score AND player!!!
|
342
|
+
@re = GOAL_RE
|
343
|
+
puts " ENTER GOAL_RE MODE" if debug?
|
344
|
+
|
345
|
+
score = {}
|
346
|
+
## must always have ft for now e.g. 1-1 or such
|
347
|
+
### change to (generic) score from ft -
|
348
|
+
## might be score a.e.t. or such - why? why not?
|
349
|
+
score[:ft] = [m[:ft1].to_i(10),
|
350
|
+
m[:ft2].to_i(10)]
|
351
|
+
## note - for debugging keep (pass along) "literal" score
|
352
|
+
tokens << [:SCORE, [m[:score], score]]
|
353
|
+
|
354
|
+
## auto-add player token
|
355
|
+
tokens << [:PLAYER, m[:name]]
|
356
|
+
|
357
|
+
offsets = [m.begin(0), m.end(0)]
|
358
|
+
pos = offsets[1] ## update pos
|
359
|
+
end
|
360
|
+
|
361
|
+
m = PLAYER_WITH_MINUTE_RE.match( line )
|
362
|
+
if m
|
363
|
+
## switch context to GOAL_RE (goalline(s))
|
364
|
+
## split token (automagically) into two!! - player AND minute!!!
|
365
|
+
@re = GOAL_RE
|
366
|
+
puts " ENTER GOAL_RE MODE" if debug?
|
367
|
+
|
368
|
+
## check for optional open_bracket
|
369
|
+
tokens << [:'['] if m[:open_bracket]
|
370
|
+
|
371
|
+
## check for -; (none with separator)
|
372
|
+
## todo - find a better way? how possible?
|
373
|
+
tokens << [:NONE, "<|NONE|>"] if m[:none]
|
374
|
+
|
375
|
+
|
376
|
+
|
377
|
+
## auto-add player token first
|
378
|
+
tokens << [:PLAYER, m[:name]]
|
379
|
+
## minute props
|
380
|
+
minute = {}
|
381
|
+
minute[:m] = m[:value].to_i(10)
|
382
|
+
minute[:offset] = m[:value2].to_i(10) if m[:value2]
|
383
|
+
## t is minute only
|
384
|
+
tokens << [:MINUTE, [m[:minute], minute]]
|
385
|
+
|
386
|
+
offsets = [m.begin(0), m.end(0)]
|
387
|
+
pos = offsets[1] ## update pos
|
388
|
+
end
|
389
|
+
end
|
390
|
+
|
391
|
+
|
392
|
+
|
318
393
|
while m = @re.match( line, pos )
|
319
|
-
#
|
320
|
-
#
|
321
|
-
#
|
322
|
-
#
|
394
|
+
# if debug?
|
395
|
+
# pp m
|
396
|
+
# puts "pos: #{pos}"
|
397
|
+
# end
|
323
398
|
offsets = [m.begin(0), m.end(0)]
|
324
399
|
|
325
400
|
if offsets[0] != pos
|
@@ -389,16 +464,53 @@ def _tokenize_line( line )
|
|
389
464
|
puts "!!! TOKENIZE ERROR (PROP_RE) - no match found"
|
390
465
|
nil
|
391
466
|
end
|
467
|
+
elsif @re == GOAL_RE
|
468
|
+
if m[:space] || m[:spaces]
|
469
|
+
nil ## skip space(s)
|
470
|
+
elsif m[:prop_name] ## note - change prop_name to player
|
471
|
+
[:PLAYER, m[:name]]
|
472
|
+
elsif m[:minute]
|
473
|
+
minute = {}
|
474
|
+
minute[:m] = m[:value].to_i(10)
|
475
|
+
minute[:offset] = m[:value2].to_i(10) if m[:value2]
|
476
|
+
## note - for debugging keep (pass along) "literal" minute
|
477
|
+
[:MINUTE, [m[:minute], minute]]
|
478
|
+
elsif m[:score]
|
479
|
+
score = {}
|
480
|
+
## must always have ft for now e.g. 1-1 or such
|
481
|
+
### change to (generic) score from ft -
|
482
|
+
## might be score a.e.t. or such - why? why not?
|
483
|
+
score[:ft] = [m[:ft1].to_i(10),
|
484
|
+
m[:ft2].to_i(10)]
|
485
|
+
## note - for debugging keep (pass along) "literal" score
|
486
|
+
[:SCORE, [m[:score], score]]
|
487
|
+
elsif m[:og]
|
488
|
+
[:OG, m[:og]] ## for typed drop - string version/variants ?? why? why not?
|
489
|
+
elsif m[:pen]
|
490
|
+
[:PEN, m[:pen]]
|
491
|
+
elsif m[:sym]
|
492
|
+
sym = m[:sym]
|
493
|
+
## return symbols "inline" as is - why? why not?
|
494
|
+
## (?<sym>[;,@|\[\]-])
|
495
|
+
|
496
|
+
case sym
|
497
|
+
when ',' then [:',']
|
498
|
+
when ';' then [:';']
|
499
|
+
when '[' then [:'[']
|
500
|
+
when ']' then [:']']
|
501
|
+
else
|
502
|
+
nil ## ignore others (any sym other than , ; [ ])
|
503
|
+
end
|
504
|
+
else
|
505
|
+
## report error
|
506
|
+
puts "!!! TOKENIZE ERROR (GOAL_RE) - no match found"
|
507
|
+
nil
|
508
|
+
end
|
392
509
|
###################################################
|
393
510
|
## assume TOP_LEVEL (a.k.a. RE) machinery
|
394
511
|
else
|
395
512
|
if m[:space] || m[:spaces]
|
396
513
|
nil ## skip space(s)
|
397
|
-
elsif m[:prop_key]
|
398
|
-
## switch context to PROP_RE
|
399
|
-
@re = PROP_RE
|
400
|
-
puts " ENTER PROP_RE MODE" if debug?
|
401
|
-
[:PROP, m[:key]]
|
402
514
|
elsif m[:text]
|
403
515
|
[:TEXT, m[:text]] ## keep pos - why? why not?
|
404
516
|
elsif m[:status] ## (match) status e.g. cancelled, awarded, etc.
|
@@ -436,7 +548,9 @@ def _tokenize_line( line )
|
|
436
548
|
date = {}
|
437
549
|
## map month names
|
438
550
|
## note - allow any/upcase JULY/JUL etc. thus ALWAYS downcase for lookup
|
439
|
-
date[:y]
|
551
|
+
date[:y] = m[:year].to_i(10) if m[:year]
|
552
|
+
## check - use y too for two-digit year or keep separate - why? why not?
|
553
|
+
date[:yy] = m[:yy].to_i(10) if m[:yy] ## two digit year (e.g. 25 or 78 etc.)
|
440
554
|
date[:m] = m[:month].to_i(10) if m[:month]
|
441
555
|
date[:m] = MONTH_MAP[ m[:month_name].downcase ] if m[:month_name]
|
442
556
|
date[:d] = m[:day].to_i(10) if m[:day]
|
@@ -480,6 +594,8 @@ def _tokenize_line( line )
|
|
480
594
|
elsif m[:score]
|
481
595
|
score = {}
|
482
596
|
## must always have ft for now e.g. 1-1 or such
|
597
|
+
### change to (generic) score from ft -
|
598
|
+
## might be score a.e.t. or such - why? why not?
|
483
599
|
score[:ft] = [m[:ft1].to_i(10),
|
484
600
|
m[:ft2].to_i(10)]
|
485
601
|
## note - for debugging keep (pass along) "literal" score
|
@@ -490,10 +606,6 @@ def _tokenize_line( line )
|
|
490
606
|
minute[:offset] = m[:value2].to_i(10) if m[:value2]
|
491
607
|
## note - for debugging keep (pass along) "literal" minute
|
492
608
|
[:MINUTE, [m[:minute], minute]]
|
493
|
-
elsif m[:og]
|
494
|
-
[:OG, m[:og]] ## for typed drop - string version/variants ?? why? why not?
|
495
|
-
elsif m[:pen]
|
496
|
-
[:PEN, m[:pen]]
|
497
609
|
elsif m[:vs]
|
498
610
|
[:VS, m[:vs]]
|
499
611
|
elsif m[:sym]
|
@@ -514,8 +626,13 @@ def _tokenize_line( line )
|
|
514
626
|
when '---' then [:'---'] # level 3
|
515
627
|
when '----' then [:'----'] # level 4
|
516
628
|
else
|
629
|
+
puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
|
517
630
|
nil ## ignore others (e.g. brackets [])
|
518
631
|
end
|
632
|
+
elsif m[:any]
|
633
|
+
## todo/check log error
|
634
|
+
puts "!!! TOKENIZE ERROR (any) - no match found >#{m[:any]}<"
|
635
|
+
nil
|
519
636
|
else
|
520
637
|
## report error
|
521
638
|
puts "!!! TOKENIZE ERROR - no match found"
|
@@ -560,6 +677,12 @@ def _tokenize_line( line )
|
|
560
677
|
tokens << [:PROP_END, "<|PROP_END|>"]
|
561
678
|
end
|
562
679
|
end
|
680
|
+
|
681
|
+
|
682
|
+
if @re == GOAL_RE ### ALWAYS switch back to top level mode
|
683
|
+
puts " LEAVE GOAL_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
|
684
|
+
@re = RE
|
685
|
+
end
|
563
686
|
|
564
687
|
[tokens,errors]
|
565
688
|
end
|