sportdb-parser 0.5.9 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Manifest.txt +2 -0
- data/lib/sportdb/parser/lexer.rb +101 -36
- data/lib/sportdb/parser/parser.rb +561 -387
- data/lib/sportdb/parser/racc_parser.rb +5 -3
- data/lib/sportdb/parser/racc_tree.rb +12 -5
- data/lib/sportdb/parser/token-date.rb +81 -13
- data/lib/sportdb/parser/token-minute.rb +45 -0
- data/lib/sportdb/parser/token-prop.rb +133 -0
- data/lib/sportdb/parser/token-score.rb +25 -14
- data/lib/sportdb/parser/token-text.rb +9 -2
- data/lib/sportdb/parser/token.rb +51 -176
- data/lib/sportdb/parser/version.rb +2 -2
- data/lib/sportdb/parser.rb +2 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b299ddece5e64b86bb7ee6b55578099b0624b11d8e5f10721363f45d6ef5d8d8
+  data.tar.gz: 5712c99b200e6116c9f07fba1215a4bf2560e5bd848c3c8cc48959aa17997b85
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5d2fce54482e12542c35abd46a292d7f5e6b4db894bba3a7f911269f116d9fa530653d36ace4295e2f819bb974093b5567a5494a2d50b54ee3f250b314d40a73
+  data.tar.gz: 7b6ef8aaafa2d20c0356fcdc048211f24a04cc4f95819ad8d225b2c9a4a29e44d8f415190acfbe3e31b2f9cc457a12f8e75c460394e984d5b9b1f476f0f8e30f
data/CHANGELOG.md
CHANGED
data/Manifest.txt
CHANGED
@@ -14,6 +14,8 @@ lib/sportdb/parser/parser.rb
 lib/sportdb/parser/racc_parser.rb
 lib/sportdb/parser/racc_tree.rb
 lib/sportdb/parser/token-date.rb
+lib/sportdb/parser/token-minute.rb
+lib/sportdb/parser/token-prop.rb
 lib/sportdb/parser/token-score.rb
 lib/sportdb/parser/token-status.rb
 lib/sportdb/parser/token-text.rb
data/lib/sportdb/parser/lexer.rb
CHANGED
@@ -147,12 +147,15 @@ def initialize( lines, debug: false )
     ## strip lines with comments and empty lines striped / removed
     ##   keep empty lines? why? why not?
     ##   keep leading spaces (indent) - why?
+    ##
+    ##  note - KEEP empty lines (get turned into BLANK token!!!!)
+
     @txt = String.new
     txt_pre.each_line do |line|    ## preprocess
       line = line.strip
-      next if line.
+      next if line.start_with?('#')   ### skip comments
 
-      line = line.sub( /#.*/, '' ).strip
+      line = line.sub( /#.*/, '' ).strip   ### cut-off end-of line comments too
 
       @txt << line
       @txt << "\n"
@@ -193,6 +196,18 @@ def tokenize_with_errors
            t
          end
 
+    ### check for "section" starters e.g. Teams or such
+    t = tokens[0]
+    if t[0] == :TEXT
+      text = t[1]
+      if text =~ /^teams$/i
+        t[0] = :TEAMS
+      elsif text =~ /^blank$/i   ### todo/fix -- remove!!! add real blanks!!
+        t[0] = :BLANK
+      else
+      end
+    end
+
     #################
     ## pass 2
     ## transform tokens (using simple patterns)
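The new pass-1 check re-tags a line-leading TEXT token when its text is a section keyword. A minimal standalone sketch of the same rewrite (token pairs hand-built here, not produced by the gem):

```ruby
## re-tag the first token of a line if its text is a section keyword
tokens = [[:TEXT, 'Teams'], [:NEWLINE, "\n"]]

t = tokens[0]
if t[0] == :TEXT
  case t[1]
  when /^teams$/i then t[0] = :TEAMS
  when /^blank$/i then t[0] = :BLANK
  end
end

pp tokens   #=> [[:TEAMS, "Teams"], [:NEWLINE, "\n"]]
```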
@@ -227,13 +242,22 @@ def tokenize_with_errors
         end
 
 
-        if buf.match?( :TEXT, [:SCORE, :VS, :'-'], :TEXT )
+        if buf.match?( :TEXT, [:SCORE, :SCORE_MORE, :VS, :'-'], :TEXT )
           nodes << [:TEAM, buf.next[1]]
           nodes << buf.next
           nodes << [:TEAM, buf.next[1]]
         elsif buf.match?( :TEXT, :MINUTE )
           nodes << [:PLAYER, buf.next[1]]
           nodes << buf.next
+        elsif buf.match?( :DATE, :TIME )   ## merge DATE TIME into DATETIME
+          date = buf.next[1]
+          time = buf.next[1]
+          ## puts "DATETIME:"
+          ## pp date, time
+          val = [date[0] + ' ' + time[0],   ## concat string of two tokens
+                 { date: date[1], time: time[1] }
+                ]
+          nodes << [:DATETIME, val]
         else
           ## pass through
           nodes << buf.next
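Token values are [literal, attributes] pairs, so the new DATE+TIME merge concatenates the two literals and nests both attribute hashes under one DATETIME token. A hedged sketch with hand-built values (the real hashes come from the date and time regex captures, which are not shown in this diff):

```ruby
date = ['Sat Aug/10 2024', { y: 2024, m: 8, d: 10 }]   ## assumed value shape
time = ['18.30',           { h: 18, m: 30 }]           ## assumed value shape

val = [date[0] + ' ' + time[0],             ## concat string of two tokens
       { date: date[1], time: time[1] }]

pp [:DATETIME, val]
#=> [:DATETIME, ["Sat Aug/10 2024 18.30",
#                {:date=>{:y=>2024, :m=>8, :d=>10}, :time=>{:h=>18, :m=>30}}]]
```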
@@ -247,8 +271,14 @@ def tokenize_with_errors
     ## flatten tokens
     tokens = []
     tokens_by_line.each do |tok|
+
+      if debug?
+        pp tok
+      end
+
       tokens += tok
-
+      ## auto-add newlines (unless BLANK!!)
+      tokens << [:NEWLINE, "\n"]   unless tok[0][0] == :BLANK
     end
 
     [tokens,errors]
@@ -260,7 +290,18 @@ def _tokenize_line( line )
     tokens = []
     errors = []   ## keep a list of errors - why? why not?
 
-    puts ">#{line}<"   if debug?
+    puts "line: >#{line}<"   if debug?
+
+
+    ### special case for empty line (aka BLANK)
+    if line.empty?
+      ## note - blank always resets parser mode to std/top-level!!!
+      @re = RE
+
+      tokens << [:BLANK, '<|BLANK|>']
+      return [tokens, errors]
+    end
+
 
     pos = 0
     ## track last offsets - to report error on no match
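Combined with the auto-added NEWLINE in the flatten step above, an empty input line now yields exactly one BLANK token and resets the lexer to top-level mode. A rough sketch of the observable behavior (a simplified stand-in, not the gem's API):

```ruby
## empty line  =>  single BLANK token (and mode reset to top-level)
def tokenize_line( line )
  return [[:BLANK, '<|BLANK|>']]   if line.empty?
  ## ... the regular regex-driven scan would run here ...
  [[:TEXT, line]]
end

pp tokenize_line( '' )                    #=> [[:BLANK, "<|BLANK|>"]]
pp tokenize_line( 'Arsenal v Chelsea' )   #=> [[:TEXT, "Arsenal v Chelsea"]]
```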
@@ -275,10 +316,10 @@ def _tokenize_line( line )
 
 
     while m = @re.match( line, pos )
-      if debug?
-        pp m
-        puts "pos: #{pos}"
-      end
+      #  if debug?
+      #    pp m
+      #    puts "pos: #{pos}"
+      #  end
       offsets = [m.begin(0), m.end(0)]
 
       if offsets[0] != pos
@@ -298,7 +339,7 @@ def _tokenize_line( line )
 
       pos = offsets[1]
 
-      pp offsets   if debug?
+      # pp offsets   if debug?
 
       ##
       ## note: racc requires pairs e.g. [:TOKEN, VAL]
@@ -306,12 +347,8 @@ def _tokenize_line( line )
 
 
       t = if @re == PROP_RE
-             if m[:space]
-               ## skip space
-               nil
-             elsif m[:spaces]
-               ## skip spaces
-               nil
+             if m[:space] || m[:spaces]
+               nil   ## skip space(s)
             elsif m[:prop_name]
               if m[:name] == 'Y'
                 [:YELLOW_CARD, m[:name]]
@@ -339,11 +376,11 @@ def _tokenize_line( line )
               when '(' then [:'(']
               when ')' then [:')']
               when '-' then [:'-']
-
-
-
-
-
+              # when '.' then
+              #   ## switch back to top-level mode!!
+              #   puts "  LEAVE PROP_RE MODE, BACK TO TOP_LEVEL/RE"   if debug?
+              #   @re = RE
+              #   [:'.']
               else
                 nil   ## ignore others (e.g. brackets [])
               end
@@ -353,12 +390,8 @@ def _tokenize_line( line )
               nil
             end
           else   ## assume TOP_LEVEL (a.k.a. RE) machinery
-            if m[:space]
-              ## skip space
-              nil
-            elsif m[:spaces]
-              ## skip spaces
-              nil
+            if m[:space] || m[:spaces]
+              nil   ## skip space(s)
            elsif m[:prop_key]
              ## switch context to PROP_RE
              @re = PROP_RE
@@ -397,6 +430,7 @@ def _tokenize_line( line )
              ## map month names
              ## note - allow any/upcase JULY/JUL etc. thus ALWAYS downcase for lookup
              date[:y] = m[:year].to_i(10)   if m[:year]
+             date[:m] = m[:month].to_i(10)  if m[:month]
              date[:m] = MONTH_MAP[ m[:month_name].downcase ]   if m[:month_name]
              date[:d] = m[:day].to_i(10)    if m[:day]
              date[:wday] = DAY_MAP[ m[:day_name].downcase ]    if m[:day_name]
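The added line maps a numeric month capture directly, next to the existing month-name lookup. A tiny illustration of the two paths, with a hash standing in for the regex MatchData and an abbreviated stand-in for the gem's MONTH_MAP table:

```ruby
MONTH_MAP = { 'jan' => 1, 'aug' => 8 }   ## abbreviated stand-in table

m    = { year: '2024', month: nil, month_name: 'Aug', day: '10' }   ## fake MatchData
date = {}
date[:y] = m[:year].to_i(10)                     if m[:year]
date[:m] = m[:month].to_i(10)                    if m[:month]        ## new numeric path
date[:m] = MONTH_MAP[ m[:month_name].downcase ]  if m[:month_name]   ## name-lookup path
date[:d] = m[:day].to_i(10)                      if m[:day]

pp date   #=> {:y=>2024, :m=>8, :d=>10}
```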
@@ -417,10 +451,12 @@ def _tokenize_line( line )
              duration[:end][:wday] = DAY_MAP[ m[:day_name2].downcase ]   if m[:day_name2]
              ## note - for debugging keep (pass along) "literal" duration
              [:DURATION, [m[:duration], duration]]
+            elsif m[:wday]   ## standalone weekday e.g. Mo/Tu/We/etc.
+              [:WDAY, [m[:wday], { wday: DAY_MAP[ m[:day_name].downcase ] } ]]
            elsif m[:num]   ## fix - change to ord (for ordinal number!!!)
              ## note - strip enclosing () and convert to integer
              [:ORD, [m[:num], { value: m[:value].to_i(10) } ]]
-            elsif m[:
+            elsif m[:score_more]
              score = {}
              ## check for pen
              score[:p] = [m[:p1].to_i(10),
@@ -433,8 +469,15 @@ def _tokenize_line( line )
                           m[:ht2].to_i(10)]   if m[:ht1] && m[:ht2]
 
              ## note - for debugging keep (pass along) "literal" score
-              [:
-            elsif m[:
+              [:SCORE_MORE, [m[:score_more], score]]
+            elsif m[:score]
+              score = {}
+              ## must always have ft for now e.g. 1-1 or such
+              score[:ft] = [m[:ft1].to_i(10),
+                            m[:ft2].to_i(10)]
+              ## note - for debugging keep (pass along) "literal" score
+              [:SCORE, [m[:score], score]]
            elsif m[:minute]
              minute = {}
              minute[:m] = m[:value].to_i(10)
              minute[:offset] = m[:value2].to_i(10)   if m[:value2]
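Score handling now splits into two token kinds: a plain full-time result becomes SCORE, while results carrying penalty and/or half-time parts become SCORE_MORE. A sketch of the two value shapes, using hashes as stand-ins for the MatchData (capture names ft1/ft2/p1/p2/ht1/ht2 as in the code above; the literal strings are only illustrative):

```ruby
## plain full-time result, e.g. "2-1"  =>  SCORE
m = { score: '2-1', ft1: '2', ft2: '1' }
score = { ft: [m[:ft1].to_i(10), m[:ft2].to_i(10)] }
pp [:SCORE, [m[:score], score]]   #=> [:SCORE, ["2-1", {:ft=>[2, 1]}]]

## richer result with penalty + half-time parts  =>  SCORE_MORE
m = { score_more: '5-4 pen. (1-1)', p1: '5', p2: '4', ht1: '1', ht2: '1' }
score = {}
score[:p]  = [m[:p1].to_i(10),  m[:p2].to_i(10)]    if m[:p1]  && m[:p2]
score[:ht] = [m[:ht1].to_i(10), m[:ht2].to_i(10)]   if m[:ht1] && m[:ht2]
pp [:SCORE_MORE, [m[:score_more], score]]
```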
@@ -454,11 +497,15 @@ def _tokenize_line( line )
            case sym
            when ',' then [:',']
            when ';' then [:';']
+            when '/' then [:'/']
            when '@' then [:'@']
            when '|' then [:'|']
            when '[' then [:'[']
            when ']' then [:']']
-            when '-' then [:'-']
+            when '-'    then [:'-']      # level 1  OR (classic) dash
+            when '--'   then [:'--']     # level 2
+            when '---'  then [:'---']    # level 3
+            when '----' then [:'----']   # level 4
            else
              nil   ## ignore others (e.g. brackets [])
            end
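For the new multi-dash symbols to tokenize correctly, the (not shown) symbol regex has to try longer dash runs before shorter ones, since Ruby regex alternation is ordered and takes the first branch that matches. A quick illustration of the pitfall:

```ruby
## order matters: the alternation must list the longest run first
re_bad  = /-|--|---|----/
re_good = /----|---|--|-/

p '---'.scan( re_bad )    #=> ["-", "-", "-"]   (three level-1 dashes - wrong)
p '---'.scan( re_good )   #=> ["---"]           (one level-3 dash - right)
```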
@@ -472,11 +519,11 @@ def _tokenize_line( line )
 
       tokens << t   if t
 
-      if debug?
-        print ">"
-        print "*" * pos
-        puts "#{line[pos..-1]}<"
-      end
+      #  if debug?
+      #    print ">"
+      #    print "*" * pos
+      #    puts "#{line[pos..-1]}<"
+      #  end
     end
 
     ## check if no match in end of string
@@ -489,6 +536,24 @@ def _tokenize_line( line )
     end
 
 
+    ##
+    ## if in prop mode continue if last token is [,-]
+    ##   otherwise change back to "standard" mode
+    if @re == PROP_RE
+      if [:',', :'-'].include?( tokens[-1][0] )
+        ## continue/stay in PROP_RE mode
+        ## todo/check - auto-add PROP_CONT token or such
+        ##                to help parser with possible NEWLINE
+        ##                  conflicts - why? why not?
+      else
+        ## switch back to top-level mode!!
+        puts "  LEAVE PROP_RE MODE, BACK TO TOP_LEVEL/RE"   if debug?
+        @re = RE
+        ## note - auto-add PROP_END (<PROP_END>)
+        tokens << [:PROP_END, "<|PROP_END|>"]
+      end
+    end
+
     [tokens,errors]
   end
 
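Net effect: a property line ending in ',' or '-' keeps the lexer in prop mode so the list can continue on the next line; any other ending switches back to top-level and appends a synthetic PROP_END marker. A condensed sketch of that end-of-line decision (token pairs hand-built; :PROP and :TEXT are placeholder token names, not the gem's API):

```ruby
## trailing ',' or '-'  =>  the property list continues on the next line;
## otherwise close it with an auto-added PROP_END token
def close_props( tokens )
  unless [:',', :'-'].include?( tokens[-1][0] )
    tokens << [:PROP_END, '<|PROP_END|>']
  end
  tokens
end

pp close_props( [[:PROP, 'goals'], [:',']] )         ## stays open - continues
pp close_props( [[:PROP, 'goals'], [:TEXT, 'x']] )   ## closed with PROP_END
```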