sportdb-parser 0.6.0 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/lib/sportdb/parser/lexer.rb +63 -10
- data/lib/sportdb/parser/parser.rb +521 -404
- data/lib/sportdb/parser/racc_parser.rb +4 -2
- data/lib/sportdb/parser/token-date.rb +66 -15
- data/lib/sportdb/parser/token-minute.rb +19 -4
- data/lib/sportdb/parser/token-score.rb +25 -14
- data/lib/sportdb/parser/token-status.rb +109 -0
- data/lib/sportdb/parser/token.rb +13 -2
- data/lib/sportdb/parser/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 85526406c8cd97a5b4e8580e64597b60f2046f4667a97080434238e067be2788
+  data.tar.gz: dcd5e6aaa854654974644c026fb99545c31ef2d5929d0518d8418630b5d6ea76
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: fc086846a66d2657d3debae5562fe20fbd2327741c8af1485972dfd0b8f46b3c649c0345ba173f2d3f40622bd4bddecc7ea0072d4d129bc5dc542554c539ebab
+  data.tar.gz: c0c4653cb40cb89e8086b6dc00ed853c62abf75fc18972cf98230a779cd8a7f73d797098ec53c2e942e6271c00c9d11a1da8f6975704141cbfbba599ec741098

data/CHANGELOG.md
CHANGED
data/lib/sportdb/parser/lexer.rb
CHANGED
@@ -147,12 +147,15 @@ def initialize( lines, debug: false )
     ##  strip lines with comments and empty lines striped / removed
     ##  keep empty lines? why? why not?
     ##  keep leading spaces (indent) - why?
+    ##
+    ##  note - KEEP empty lines (get turned into BLANK token!!!!)
+
     @txt = String.new
     txt_pre.each_line do |line|   ## preprocess
       line = line.strip
-      next if line.
+      next if line.start_with?('#')   ### skip comments

-      line = line.sub( /#.*/, '' ).strip
+      line = line.sub( /#.*/, '' ).strip   ### cut-off end-of line comments too

       @txt << line
       @txt << "\n"

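Note: with this change the preprocessor keeps empty lines (they later become BLANK tokens) while still dropping comments. A minimal standalone sketch of the rule, with a made-up sample input (not the gem's public API):

```ruby
txt = String.new
sample = <<~TXT
  # a full-line comment - skipped
  Arsenal 1-1 Chelsea   # an end-of-line comment - cut off

  Leeds United 2-0 Derby
TXT

sample.each_line do |line|
  line = line.strip
  next if line.start_with?( '#' )       ## skip full-line comments
  line = line.sub( /#.*/, '' ).strip    ## cut off end-of-line comments too
  txt << line << "\n"                   ## note - empty lines kept
end

print txt
## Arsenal 1-1 Chelsea
## (empty line kept)
## Leeds United 2-0 Derby
```
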
@@ -193,6 +196,18 @@ def tokenize_with_errors
         t
       end

+      ### check for "section" starters e.g. Teams or such
+      t = tokens[0]
+      if t[0] == :TEXT
+        text = t[1]
+        if text =~ /^teams$/i
+          t[0] = :TEAMS
+        elsif text =~ /^blank$/i    ### todo/fix -- remove!!! add real blanks!!
+          t[0] = :BLANK
+        else
+        end
+      end
+
       #################
       ## pass 2
       ## transform tokens (using simple patterns)

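Note: the new "section starter" pass promotes a line's leading :TEXT token to a section keyword in place. A hypothetical token-level illustration (token shapes follow the diff):

```ruby
tokens = [[:TEXT, 'Teams'], [:NEWLINE, "\n"]]

t = tokens[0]
t[0] = :TEAMS   if t[0] == :TEXT && t[1] =~ /^teams$/i   ## promote in place

p tokens   #=> [[:TEAMS, "Teams"], [:NEWLINE, "\n"]]
```
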
@@ -227,13 +242,22 @@ def tokenize_with_errors
         end


-        if buf.match?( :TEXT, [:SCORE, :VS, :'-'], :TEXT )
+        if buf.match?( :TEXT, [:SCORE, :SCORE_MORE, :VS, :'-'], :TEXT )
           nodes << [:TEAM, buf.next[1]]
           nodes << buf.next
           nodes << [:TEAM, buf.next[1]]
         elsif buf.match?( :TEXT, :MINUTE )
           nodes << [:PLAYER, buf.next[1]]
           nodes << buf.next
+        elsif buf.match?( :DATE, :TIME )   ## merge DATE TIME into DATETIME
+          date = buf.next[1]
+          time = buf.next[1]
+          ## puts "DATETIME:"
+          ## pp date, time
+          val = [date[0] + ' ' + time[0],   ## concat string of two tokens
+                 { date: date[1], time: time[1] }
+                ]
+          nodes << [:DATETIME, val]
         else
           ## pass through
           nodes << buf.next

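Note: adjacent :DATE and :TIME tokens now merge into one :DATETIME token that carries both the concatenated literal and the two parsed parts. A sketch with assumed [literal, attributes] value shapes:

```ruby
date = ['Sat Aug/17', { date: 'Sat Aug/17' }]   ## assumed sample values
time = ['15:30',      { time: '15:30' }]

val = [date[0] + ' ' + time[0],                 ## concat string of two tokens
       { date: date[1], time: time[1] }]

p [:DATETIME, val]
#=> [:DATETIME, ["Sat Aug/17 15:30", { date: {...}, time: {...} }]]
```
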
@@ -253,7 +277,8 @@ def tokenize_with_errors
         end

         tokens += tok
-
+        ## auto-add newlines (unless BLANK!!)
+        tokens << [:NEWLINE, "\n"]   unless tok[0][0] == :BLANK
       end

       [tokens,errors]

@@ -267,6 +292,17 @@ def _tokenize_line( line )

     puts "line: >#{line}<"    if debug?

+
+    ### special case for empty line (aka BLANK)
+    if line.empty?
+      ## note - blank always resets parser mode to std/top-level!!!
+      @re = RE
+
+      tokens << [:BLANK, '<|BLANK|>']
+      return [tokens, errors]
+    end
+
+
     pos = 0
     ## track last offsets - to report error on no match
     ##   or no match in end of string

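Note: together with the auto-added newlines above, an empty line now short-circuits to a single BLANK token, appends no trailing NEWLINE, and resets the lexer to top-level mode. A sketch of the resulting per-line bookkeeping (shapes follow the diff):

```ruby
tok    = [[:BLANK, '<|BLANK|>']]   ## what _tokenize_line returns for ""
tokens = []

tokens += tok
tokens << [:NEWLINE, "\n"]   unless tok[0][0] == :BLANK   ## no NEWLINE after BLANK

p tokens   #=> [[:BLANK, "<|BLANK|>"]]
```
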
@@ -353,7 +389,9 @@ def _tokenize_line( line )
          puts "!!! TOKENIZE ERROR (PROP_RE) - no match found"
          nil
        end
-
+      ###################################################
+      ## assume TOP_LEVEL (a.k.a. RE) machinery
+      else
        if m[:space] || m[:spaces]
          nil   ## skip space(s)
        elsif m[:prop_key]

@@ -372,6 +410,11 @@ def _tokenize_line( line )
          else
            [:STATUS, [m[:status], {status: m[:status] } ]]
          end
+       elsif m[:note]
+         ### todo/check:
+         ##    use value hash - why? why not? or simplify to:
+         ##      [:NOTE, m[:note]]
+         [:NOTE, [m[:note], {note: m[:note] } ]]
        elsif m[:time]
          ## unify to iso-format
          ###   12.40  =>  12:40

@@ -420,7 +463,7 @@ def _tokenize_line( line )
        elsif m[:num]   ## fix - change to ord (for ordinal number!!!)
          ## note - strip enclosing () and convert to integer
          [:ORD, [m[:num], { value: m[:value].to_i(10) } ]]
-       elsif m[:
+       elsif m[:score_more]
          score = {}
          ## check for pen
          score[:p] = [m[:p1].to_i(10),

@@ -433,8 +476,15 @@ def _tokenize_line( line )
                       m[:ht2].to_i(10)]   if m[:ht1] && m[:ht2]

          ## note - for debugging keep (pass along) "literal" score
-         [:
-       elsif m[:
+         [:SCORE_MORE, [m[:score_more], score]]
+       elsif m[:score]
+         score = {}
+         ## must always have ft for now e.g. 1-1 or such
+         score[:ft] = [m[:ft1].to_i(10),
+                       m[:ft2].to_i(10)]
+         ## note - for debugging keep (pass along) "literal" score
+         [:SCORE, [m[:score], score]]
+       elsif m[:minute]
          minute = {}
          minute[:m] = m[:value].to_i(10)
          minute[:offset] = m[:value2].to_i(10)   if m[:value2]

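Note: the score branch now splits into two token shapes. Plain :SCORE always carries a full-time pair; :SCORE_MORE may also carry penalty and halftime pairs. A hypothetical illustration (the literals are made up; only the ft/p/ht keys come from the diff):

```ruby
## plain score, e.g. "1-1" - full-time only
p [:SCORE, ['1-1', { ft: [1, 1] }]]

## "more" score with penalty and halftime parts; literal kept for debugging
p [:SCORE_MORE, ['3-2 pen. (1-1)', { p: [3, 2], ht: [1, 1] }]]
```
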
@@ -459,7 +509,10 @@ def _tokenize_line( line )
          when '|' then [:'|']
          when '[' then [:'[']
          when ']' then [:']']
-         when '-' then [:'-']
+         when '-'    then [:'-']      # level 1  OR (classic) dash
+         when '--'   then [:'--']     # level 2
+         when '---'  then [:'---']    # level 3
+         when '----' then [:'----']   # level 4
          else
            nil   ## ignore others (e.g. brackets [])
          end

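Note: the single dash grows into four levels, so longer dash runs tokenize as distinct separator levels instead of falling through to nil. A minimal sketch (dash_token is a hypothetical wrapper around the case expression above):

```ruby
def dash_token( str )
  case str
  when '-'    then [:'-']      # level 1  OR (classic) dash
  when '--'   then [:'--']     # level 2
  when '---'  then [:'---']    # level 3
  when '----' then [:'----']   # level 4
  else             nil         # ignore others
  end
end

p %w[- -- --- ----].map { |s| dash_token( s ) }
#=> [[:"-"], [:"--"], [:"---"], [:"----"]]
```
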
@@ -504,7 +557,7 @@ def _tokenize_line( line )
          puts "  LEAVE PROP_RE MODE, BACK TO TOP_LEVEL/RE"   if debug?
          @re = RE
          ## note - auto-add PROP_END (<PROP_END>)
-         tokens << [:PROP_END, "
+         tokens << [:PROP_END, "<|PROP_END|>"]
        end
      end
