sportdb-parser 0.5.0 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -2
- data/Manifest.txt +1 -0
- data/lib/sportdb/parser/parser.rb +421 -232
- data/lib/sportdb/parser/token.rb +120 -207
- data/lib/sportdb/parser/tokenizer.rb +262 -0
- data/lib/sportdb/parser/version.rb +1 -1
- data/lib/sportdb/parser.rb +64 -6
- metadata +3 -2
data/lib/sportdb/parser/token.rb
CHANGED
@@ -107,7 +107,7 @@ BASICS_RE = %r{
|
|
107
107
|
|
108
108
|
MINUTE_RE = %r{
|
109
109
|
(?<minute>
|
110
|
-
(?<=[ ]) # Positive lookbehind for space required
|
110
|
+
(?<=[ (]) # Positive lookbehind for space or opening ( e.g. (61') required
|
111
111
|
(?<value>\d{1,3}) ## constrain numbers to 0 to 999!!!
|
112
112
|
(?: \+
|
113
113
|
(?<value2>\d{1,3})
|
@@ -136,7 +136,125 @@ GOAL_OG_RE = %r{
|
|
136
136
|
|
137
137
|
|
138
138
|
|
139
|
-
|
139
|
+
|
140
|
+
|
141
|
+
PROP_BASICS_RE = %r{
|
142
|
+
(?<spaces> [ ]{2,}) |
|
143
|
+
(?<space> [ ])
|
144
|
+
|
|
145
|
+
(?<sym>[.;,\(\)\[\]-]) ## note - dot (.) is the (all-important) end-of-prop marker!!!
|
146
|
+
}ix
|
147
|
+
|
148
|
+
|
149
|
+
## name different from text (does not allow number in name/text)
|
150
|
+
##
|
151
|
+
## note - includes special handling for dot (.) if at the end of line!!!
|
152
|
+
## end-of-line dot (.) is the end-of-prop marker - do NOT eat-up!!!
|
153
|
+
|
154
|
+
PROP_NAME_RE = %r{
|
155
|
+
(?<prop_name> \b
|
156
|
+
(?<name>
|
157
|
+
\p{L}+
|
158
|
+
(?: \. (?: (?![ ]*$) )
|
159
|
+
)? ## edge case - check for end of prop marker! (e.g. Stop.)
|
160
|
+
(?:
|
161
|
+
[ ]? # only single spaces allowed inline!!!
|
162
|
+
(?:
|
163
|
+
(?:
|
164
|
+
(?<=\p{L}) ## use lookbehind
|
165
|
+
[/'-] ## must be surrounded by letters
|
166
|
+
## e.g. One/Two NOT
|
167
|
+
## One/ Two or One / Two or One /Two etc.
|
168
|
+
(?=\p{L}) ## use lookahead
|
169
|
+
)
|
170
|
+
|
|
171
|
+
(?:
|
172
|
+
(?<=[ ]) ## use lookbehind -- add letter (plus dot) or such - why? why not?
|
173
|
+
['] ## must be surrounded by leading space and
|
174
|
+
## trailing letters (e.g. UDI 'Beter Bed)
|
175
|
+
(?=\p{L}) ## use lookahead
|
176
|
+
)
|
177
|
+
|
|
178
|
+
(?:
|
179
|
+
(?<=\p{L}) ## use lookbehind
|
180
|
+
['] ## must be surrounded by leading letter and
|
181
|
+
## trailing space PLUS letter (e.g. UDI' Beter Bed)
|
182
|
+
(?=[ ]\p{L}) ## use lookahead (space WITH letter
|
183
|
+
)
|
184
|
+
|
|
185
|
+
(?: \p{L}+
|
186
|
+
(?: \.
|
187
|
+
(?: (?![ ]*$) )
|
188
|
+
)? ## last dot is delimiter!!!
|
189
|
+
)
|
190
|
+
)+
|
191
|
+
)*
|
192
|
+
)
|
193
|
+
## add lookahead - must be non-alphanum (or dot)
|
194
|
+
(?=[ .,;\]\)]|$)
|
195
|
+
)
|
196
|
+
}ix
|
197
|
+
|
198
|
+
|
199
|
+
|
200
|
+
|
201
|
+
##############
|
202
|
+
# add support for props/ attributes e.g.
|
203
|
+
#
|
204
|
+
# Germany: Neuer - Kimmich, Rüdiger, Tah [Y], Mittelstädt – Andrich [Y] (46' Groß),
|
205
|
+
# Kroos (80' Can) – Musiala (74' Müller), Gündogan,
|
206
|
+
# Wirtz (63' Sane) – Havertz (63' Füllkrug).
|
207
|
+
# Scotland: Gunn – Porteous [R 44'], Hendry, Tierney (78' McKenna) – Ralston [Y],
|
208
|
+
# McTominay, McGregor (67' Gilmour), Robertson – Christie (82' Shankland),
|
209
|
+
# Adams (46' Hanley), McGinn (67' McLean).
|
210
|
+
#
|
211
|
+
## note: colon (:) MUST be followed by one (or more) spaces
|
212
|
+
## make sure mon feb 12 18:10 will not match
|
213
|
+
## allow 1. FC Köln etc.
|
214
|
+
## Mainz 05:
|
215
|
+
## limit to 30 chars max
|
216
|
+
## only allow chars incl. intl but (NOT ()[]/;)
|
217
|
+
|
218
|
+
|
219
|
+
PROP_KEY_RE = %r{
|
220
|
+
(?<prop_key> \b
|
221
|
+
(?<key>
|
222
|
+
(?:\p{L}+
|
223
|
+
|
|
224
|
+
\d+ # check for num lookahead (MUST be space or dot)
|
225
|
+
## MUST be followed by (optional dot) and
|
226
|
+
## required space !!!
|
227
|
+
## MUST be followed by a to z!!!!
|
228
|
+
\.? ## optional dot
|
229
|
+
[ ]? ## make space optional too - why? why not?
|
230
|
+
## yes - eg. 1st, 2nd, 5th etc.
|
231
|
+
\p{L}+
|
232
|
+
)
|
233
|
+
[\d\p{L}'/° -]*? ## allow almost anyting
|
234
|
+
## fix - add negative lookahead
|
235
|
+
## no space and dash etc.
|
236
|
+
## only allowed "inline" not at the end
|
237
|
+
## must end with letter or digit!
|
238
|
+
)
|
239
|
+
[ ]*? # slurp trailing spaces
|
240
|
+
:
|
241
|
+
(?=[ ]+) ## possitive lookahead (must be followed by space!!)
|
242
|
+
)
|
243
|
+
}ix
|
244
|
+
|
245
|
+
|
246
|
+
|
247
|
+
|
248
|
+
PROP_RE = Regexp.union(
|
249
|
+
PROP_BASICS_RE,
|
250
|
+
MINUTE_RE,
|
251
|
+
PROP_NAME_RE,
|
252
|
+
)
|
253
|
+
|
254
|
+
|
255
|
+
|
256
|
+
RE = Regexp.union( PROP_KEY_RE, ## start with prop key (match will/should switch into prop mode!!!)
|
257
|
+
STATUS_RE,
|
140
258
|
TIMEZONE_RE,
|
141
259
|
TIME_RE,
|
142
260
|
DURATION_RE, # note - duration MUST match before date
|
@@ -147,210 +265,5 @@ RE = Regexp.union( STATUS_RE,
|
|
147
265
|
TEXT_RE )
|
148
266
|
|
149
267
|
|
150
|
-
def log( msg )
|
151
|
-
## append msg to ./logs.txt
|
152
|
-
## use ./errors.txt - why? why not?
|
153
|
-
File.open( './logs.txt', 'a:utf-8' ) do |f|
|
154
|
-
f.write( msg )
|
155
|
-
f.write( "\n" )
|
156
|
-
end
|
157
|
-
end
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
def tokenize_with_errors( line, debug: false )
|
162
|
-
tokens = []
|
163
|
-
errors = [] ## keep a list of errors - why? why not?
|
164
|
-
|
165
|
-
puts ">#{line}<" if debug
|
166
|
-
|
167
|
-
pos = 0
|
168
|
-
## track last offsets - to report error on no match
|
169
|
-
## or no match in end of string
|
170
|
-
offsets = [0,0]
|
171
|
-
m = nil
|
172
|
-
|
173
|
-
while m = RE.match( line, pos )
|
174
|
-
if debug
|
175
|
-
pp m
|
176
|
-
puts "pos: #{pos}"
|
177
|
-
end
|
178
|
-
offsets = [m.begin(0), m.end(0)]
|
179
|
-
|
180
|
-
if offsets[0] != pos
|
181
|
-
## match NOT starting at start/begin position!!!
|
182
|
-
## report parse error!!!
|
183
|
-
msg = "!! WARN - parse error - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
|
184
|
-
puts msg
|
185
|
-
|
186
|
-
errors << "parse error - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]}"
|
187
|
-
log( msg )
|
188
|
-
end
|
189
|
-
|
190
|
-
##
|
191
|
-
## todo/fix - also check if possible
|
192
|
-
## if no match but not yet end off string!!!!
|
193
|
-
## report skipped text run too!!!
|
194
|
-
|
195
|
-
pos = offsets[1]
|
196
|
-
|
197
|
-
pp offsets if debug
|
198
|
-
|
199
|
-
##
|
200
|
-
## note: racc requires pairs e.g. [:TOKEN, VAL]
|
201
|
-
## for VAL use "text" or ["text", { opts }] array
|
202
|
-
|
203
|
-
t = if m[:space]
|
204
|
-
## skip space
|
205
|
-
nil
|
206
|
-
elsif m[:spaces]
|
207
|
-
## skip spaces
|
208
|
-
nil
|
209
|
-
elsif m[:text]
|
210
|
-
[:TEXT, m[:text]] ## keep pos - why? why not?
|
211
|
-
elsif m[:status] ## (match) status e.g. cancelled, awarded, etc.
|
212
|
-
## todo/check - add text (or status)
|
213
|
-
# to opts hash {} by default (for value)
|
214
|
-
if m[:status_note] ## includes note? e.g. awarded; originally 2-0
|
215
|
-
[:STATUS, [m[:status], {status: m[:status],
|
216
|
-
note: m[:status_note]} ]]
|
217
|
-
else
|
218
|
-
[:STATUS, [m[:status], {status: m[:status] } ]]
|
219
|
-
end
|
220
|
-
elsif m[:time]
|
221
|
-
## unify to iso-format
|
222
|
-
### 12.40 => 12:40
|
223
|
-
## 12h40 => 12:40 etc.
|
224
|
-
## keep string (no time-only type in ruby)
|
225
|
-
hour = m[:hour].to_i(10) ## allow 08/07/etc.
|
226
|
-
minute = m[:minute].to_i(10)
|
227
|
-
## check if valid - 0:00 - 24:00
|
228
|
-
## check if 24:00 possible? or only 0:00 (23:59)
|
229
|
-
if (hour >= 0 && hour <= 24) &&
|
230
|
-
(minute >=0 && minute <= 59)
|
231
|
-
## note - for debugging keep (pass along) "literal" time
|
232
|
-
## might use/add support for am/pm later
|
233
|
-
[:TIME, [m[:time], {h:hour,m:minute}]]
|
234
|
-
else
|
235
|
-
raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
|
236
|
-
end
|
237
|
-
elsif m[:date]
|
238
|
-
date = {}
|
239
|
-
=begin
|
240
|
-
((?<day_name>#{DAY_NAMES})
|
241
|
-
[ ]
|
242
|
-
)?
|
243
|
-
(?<month_name>#{MONTH_NAMES})
|
244
|
-
(?: \/|[ ] )
|
245
|
-
(?<day>\d{1,2})
|
246
|
-
## optional year
|
247
|
-
( [ ]
|
248
|
-
(?<year>\d{4})
|
249
|
-
)?
|
250
|
-
=end
|
251
|
-
## map month names
|
252
|
-
## note - allow any/upcase JULY/JUL etc. thus ALWAYS downcase for lookup
|
253
|
-
date[:y] = m[:year].to_i(10) if m[:year]
|
254
|
-
date[:m] = MONTH_MAP[ m[:month_name].downcase ] if m[:month_name]
|
255
|
-
date[:d] = m[:day].to_i(10) if m[:day]
|
256
|
-
date[:wday] = DAY_MAP[ m[:day_name].downcase ] if m[:day_name]
|
257
|
-
## note - for debugging keep (pass along) "literal" date
|
258
|
-
[:DATE, [m[:date], date]]
|
259
|
-
elsif m[:timezone]
|
260
|
-
[:TIMEZONE, m[:timezone]]
|
261
|
-
elsif m[:duration]
|
262
|
-
## todo/check/fix - if end: works for kwargs!!!!!
|
263
|
-
duration = { start: {}, end: {}}
|
264
|
-
duration[:start][:y] = m[:year1].to_i(10) if m[:year1]
|
265
|
-
duration[:start][:m] = MONTH_MAP[ m[:month_name1].downcase ] if m[:month_name1]
|
266
|
-
duration[:start][:d] = m[:day1].to_i(10) if m[:day1]
|
267
|
-
duration[:start][:wday] = DAY_MAP[ m[:day_name1].downcase ] if m[:day_name1]
|
268
|
-
duration[:end][:y] = m[:year2].to_i(10) if m[:year2]
|
269
|
-
duration[:end][:m] = MONTH_MAP[ m[:month_name2].downcase ] if m[:month_name2]
|
270
|
-
duration[:end][:d] = m[:day2].to_i(10) if m[:day2]
|
271
|
-
duration[:end][:wday] = DAY_MAP[ m[:day_name2].downcase ] if m[:day_name2]
|
272
|
-
## note - for debugging keep (pass along) "literal" duration
|
273
|
-
[:DURATION, [m[:duration], duration]]
|
274
|
-
elsif m[:num] ## fix - change to ord (for ordinal number!!!)
|
275
|
-
## note - strip enclosing () and convert to integer
|
276
|
-
[:ORD, [m[:num], { value: m[:value].to_i(10) } ]]
|
277
|
-
elsif m[:score]
|
278
|
-
score = {}
|
279
|
-
## check for pen
|
280
|
-
score[:p] = [m[:p1].to_i(10),
|
281
|
-
m[:p2].to_i(10)] if m[:p1] && m[:p2]
|
282
|
-
score[:et] = [m[:et1].to_i(10),
|
283
|
-
m[:et2].to_i(10)] if m[:et1] && m[:et2]
|
284
|
-
score[:ft] = [m[:ft1].to_i(10),
|
285
|
-
m[:ft2].to_i(10)] if m[:ft1] && m[:ft2]
|
286
|
-
score[:ht] = [m[:ht1].to_i(10),
|
287
|
-
m[:ht2].to_i(10)] if m[:ht1] && m[:ht2]
|
288
|
-
|
289
|
-
## note - for debugging keep (pass along) "literal" score
|
290
|
-
[:SCORE, [m[:score], score]]
|
291
|
-
elsif m[:minute]
|
292
|
-
minute = {}
|
293
|
-
minute[:m] = m[:value].to_i(10)
|
294
|
-
minute[:offset] = m[:value2].to_i(10) if m[:value2]
|
295
|
-
## note - for debugging keep (pass along) "literal" minute
|
296
|
-
[:MINUTE, [m[:minute], minute]]
|
297
|
-
elsif m[:og]
|
298
|
-
[:OG, m[:og]] ## for typed drop - string version/variants ?? why? why not?
|
299
|
-
elsif m[:pen]
|
300
|
-
[:PEN, m[:pen]]
|
301
|
-
elsif m[:vs]
|
302
|
-
[:VS, m[:vs]]
|
303
|
-
elsif m[:sym]
|
304
|
-
sym = m[:sym]
|
305
|
-
## return symbols "inline" as is - why? why not?
|
306
|
-
## (?<sym>[;,@|\[\]-])
|
307
|
-
|
308
|
-
case sym
|
309
|
-
when ',' then [:',']
|
310
|
-
when ';' then [:';']
|
311
|
-
when '@' then [:'@']
|
312
|
-
when '|' then [:'|']
|
313
|
-
when '[' then [:'[']
|
314
|
-
when ']' then [:']']
|
315
|
-
when '-' then [:'-']
|
316
|
-
else
|
317
|
-
nil ## ignore others (e.g. brackets [])
|
318
|
-
end
|
319
|
-
else
|
320
|
-
## report error
|
321
|
-
puts "!!! TOKENIZE ERROR - no match found"
|
322
|
-
nil
|
323
|
-
end
|
324
|
-
|
325
|
-
tokens << t if t
|
326
|
-
|
327
|
-
if debug
|
328
|
-
print ">"
|
329
|
-
print "*" * pos
|
330
|
-
puts "#{line[pos..-1]}<"
|
331
|
-
end
|
332
|
-
end
|
333
|
-
|
334
|
-
## check if no match in end of string
|
335
|
-
if offsets[1] != line.size
|
336
|
-
msg = "!! WARN - parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
|
337
|
-
puts msg
|
338
|
-
log( msg )
|
339
|
-
|
340
|
-
errors << "parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size}"
|
341
|
-
end
|
342
|
-
|
343
|
-
|
344
|
-
[tokens,errors]
|
345
|
-
end
|
346
|
-
|
347
|
-
|
348
|
-
### convience helper - ignore errors by default
|
349
|
-
def tokenize( line, debug: false )
|
350
|
-
tokens, _ = tokenize_with_errors( line, debug: debug )
|
351
|
-
tokens
|
352
|
-
end
|
353
|
-
|
354
|
-
|
355
268
|
end # class Parser
|
356
269
|
end # module SportDb
|
@@ -0,0 +1,262 @@
|
|
1
|
+
|
2
|
+
module SportDb
|
3
|
+
class Parser
|
4
|
+
|
5
|
+
|
6
|
+
|
7
|
+
def log( msg )
|
8
|
+
## append msg to ./logs.txt
|
9
|
+
## use ./errors.txt - why? why not?
|
10
|
+
File.open( './logs.txt', 'a:utf-8' ) do |f|
|
11
|
+
f.write( msg )
|
12
|
+
f.write( "\n" )
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
|
17
|
+
|
18
|
+
def tokenize_with_errors( line, debug: false )
|
19
|
+
tokens = []
|
20
|
+
errors = [] ## keep a list of errors - why? why not?
|
21
|
+
|
22
|
+
puts ">#{line}<" if debug
|
23
|
+
|
24
|
+
pos = 0
|
25
|
+
## track last offsets - to report error on no match
|
26
|
+
## or no match in end of string
|
27
|
+
offsets = [0,0]
|
28
|
+
m = nil
|
29
|
+
|
30
|
+
|
31
|
+
####
|
32
|
+
## quick hack - keep re state/mode between tokenize calls!!!
|
33
|
+
@re ||= RE ## note - switch between RE & INSIDE_RE
|
34
|
+
|
35
|
+
|
36
|
+
while m = @re.match( line, pos )
|
37
|
+
if debug
|
38
|
+
pp m
|
39
|
+
puts "pos: #{pos}"
|
40
|
+
end
|
41
|
+
offsets = [m.begin(0), m.end(0)]
|
42
|
+
|
43
|
+
if offsets[0] != pos
|
44
|
+
## match NOT starting at start/begin position!!!
|
45
|
+
## report parse error!!!
|
46
|
+
msg = "!! WARN - parse error - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
|
47
|
+
puts msg
|
48
|
+
|
49
|
+
errors << "parse error - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]}"
|
50
|
+
log( msg )
|
51
|
+
end
|
52
|
+
|
53
|
+
##
|
54
|
+
## todo/fix - also check if possible
|
55
|
+
## if no match but not yet end of string!!!!
|
56
|
+
## report skipped text run too!!!
|
57
|
+
|
58
|
+
pos = offsets[1]
|
59
|
+
|
60
|
+
pp offsets if debug
|
61
|
+
|
62
|
+
##
|
63
|
+
## note: racc requires pairs e.g. [:TOKEN, VAL]
|
64
|
+
## for VAL use "text" or ["text", { opts }] array
|
65
|
+
|
66
|
+
|
67
|
+
t = if @re == PROP_RE
|
68
|
+
if m[:space]
|
69
|
+
## skip space
|
70
|
+
nil
|
71
|
+
elsif m[:spaces]
|
72
|
+
## skip spaces
|
73
|
+
nil
|
74
|
+
elsif m[:prop_name]
|
75
|
+
if m[:name] == 'Y'
|
76
|
+
[:YELLOW_CARD, m[:name]]
|
77
|
+
elsif m[:name] == 'R'
|
78
|
+
[:RED_CARD, m[:name]]
|
79
|
+
else
|
80
|
+
[:PROP_NAME, m[:name]]
|
81
|
+
end
|
82
|
+
elsif m[:minute]
|
83
|
+
minute = {}
|
84
|
+
minute[:m] = m[:value].to_i(10)
|
85
|
+
minute[:offset] = m[:value2].to_i(10) if m[:value2]
|
86
|
+
## note - for debugging keep (pass along) "literal" minute
|
87
|
+
[:MINUTE, [m[:minute], minute]]
|
88
|
+
elsif m[:sym]
|
89
|
+
sym = m[:sym]
|
90
|
+
## return symbols "inline" as is - why? why not?
|
91
|
+
## (?<sym>[.;,\(\)\[\]-]) - note: the dot (.) is the end-of-prop marker
|
92
|
+
|
93
|
+
case sym
|
94
|
+
when ',' then [:',']
|
95
|
+
when ';' then [:';']
|
96
|
+
when '[' then [:'[']
|
97
|
+
when ']' then [:']']
|
98
|
+
when '(' then [:'(']
|
99
|
+
when ')' then [:')']
|
100
|
+
when '-' then [:'-']
|
101
|
+
when '.' then
|
102
|
+
## switch back to top-level mode!!
|
103
|
+
puts " LEAVE PROP_RE MODE, BACK TO TOP_LEVEL/RE"
|
104
|
+
@re = RE
|
105
|
+
[:'.']
|
106
|
+
else
|
107
|
+
nil ## ignore others (e.g. brackets [])
|
108
|
+
end
|
109
|
+
else
|
110
|
+
## report error
|
111
|
+
puts "!!! TOKENIZE ERROR (PROP_RE) - no match found"
|
112
|
+
nil
|
113
|
+
end
|
114
|
+
else ## assume TOP_LEVEL (a.k.a. RE) machinery
|
115
|
+
if m[:space]
|
116
|
+
## skip space
|
117
|
+
nil
|
118
|
+
elsif m[:spaces]
|
119
|
+
## skip spaces
|
120
|
+
nil
|
121
|
+
elsif m[:prop_key]
|
122
|
+
## switch context to PROP_RE
|
123
|
+
@re = PROP_RE
|
124
|
+
puts " ENTER PROP_RE MODE"
|
125
|
+
[:PROP, m[:key]]
|
126
|
+
elsif m[:text]
|
127
|
+
[:TEXT, m[:text]] ## keep pos - why? why not?
|
128
|
+
elsif m[:status] ## (match) status e.g. cancelled, awarded, etc.
|
129
|
+
## todo/check - add text (or status)
|
130
|
+
# to opts hash {} by default (for value)
|
131
|
+
if m[:status_note] ## includes note? e.g. awarded; originally 2-0
|
132
|
+
[:STATUS, [m[:status], {status: m[:status],
|
133
|
+
note: m[:status_note]} ]]
|
134
|
+
else
|
135
|
+
[:STATUS, [m[:status], {status: m[:status] } ]]
|
136
|
+
end
|
137
|
+
elsif m[:time]
|
138
|
+
## unify to iso-format
|
139
|
+
### 12.40 => 12:40
|
140
|
+
## 12h40 => 12:40 etc.
|
141
|
+
## keep string (no time-only type in ruby)
|
142
|
+
hour = m[:hour].to_i(10) ## allow 08/07/etc.
|
143
|
+
minute = m[:minute].to_i(10)
|
144
|
+
## check if valid - 0:00 - 24:00
|
145
|
+
## check if 24:00 possible? or only 0:00 (23:59)
|
146
|
+
if (hour >= 0 && hour <= 24) &&
|
147
|
+
(minute >=0 && minute <= 59)
|
148
|
+
## note - for debugging keep (pass along) "literal" time
|
149
|
+
## might use/add support for am/pm later
|
150
|
+
[:TIME, [m[:time], {h:hour,m:minute}]]
|
151
|
+
else
|
152
|
+
raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
|
153
|
+
end
|
154
|
+
elsif m[:date]
|
155
|
+
date = {}
|
156
|
+
## map month names
|
157
|
+
## note - allow any/upcase JULY/JUL etc. thus ALWAYS downcase for lookup
|
158
|
+
date[:y] = m[:year].to_i(10) if m[:year]
|
159
|
+
date[:m] = MONTH_MAP[ m[:month_name].downcase ] if m[:month_name]
|
160
|
+
date[:d] = m[:day].to_i(10) if m[:day]
|
161
|
+
date[:wday] = DAY_MAP[ m[:day_name].downcase ] if m[:day_name]
|
162
|
+
## note - for debugging keep (pass along) "literal" date
|
163
|
+
[:DATE, [m[:date], date]]
|
164
|
+
elsif m[:timezone]
|
165
|
+
[:TIMEZONE, m[:timezone]]
|
166
|
+
elsif m[:duration]
|
167
|
+
## todo/check/fix - if end: works for kwargs!!!!!
|
168
|
+
duration = { start: {}, end: {}}
|
169
|
+
duration[:start][:y] = m[:year1].to_i(10) if m[:year1]
|
170
|
+
duration[:start][:m] = MONTH_MAP[ m[:month_name1].downcase ] if m[:month_name1]
|
171
|
+
duration[:start][:d] = m[:day1].to_i(10) if m[:day1]
|
172
|
+
duration[:start][:wday] = DAY_MAP[ m[:day_name1].downcase ] if m[:day_name1]
|
173
|
+
duration[:end][:y] = m[:year2].to_i(10) if m[:year2]
|
174
|
+
duration[:end][:m] = MONTH_MAP[ m[:month_name2].downcase ] if m[:month_name2]
|
175
|
+
duration[:end][:d] = m[:day2].to_i(10) if m[:day2]
|
176
|
+
duration[:end][:wday] = DAY_MAP[ m[:day_name2].downcase ] if m[:day_name2]
|
177
|
+
## note - for debugging keep (pass along) "literal" duration
|
178
|
+
[:DURATION, [m[:duration], duration]]
|
179
|
+
elsif m[:num] ## fix - change to ord (for ordinal number!!!)
|
180
|
+
## note - strip enclosing () and convert to integer
|
181
|
+
[:ORD, [m[:num], { value: m[:value].to_i(10) } ]]
|
182
|
+
elsif m[:score]
|
183
|
+
score = {}
|
184
|
+
## check for pen
|
185
|
+
score[:p] = [m[:p1].to_i(10),
|
186
|
+
m[:p2].to_i(10)] if m[:p1] && m[:p2]
|
187
|
+
score[:et] = [m[:et1].to_i(10),
|
188
|
+
m[:et2].to_i(10)] if m[:et1] && m[:et2]
|
189
|
+
score[:ft] = [m[:ft1].to_i(10),
|
190
|
+
m[:ft2].to_i(10)] if m[:ft1] && m[:ft2]
|
191
|
+
score[:ht] = [m[:ht1].to_i(10),
|
192
|
+
m[:ht2].to_i(10)] if m[:ht1] && m[:ht2]
|
193
|
+
|
194
|
+
## note - for debugging keep (pass along) "literal" score
|
195
|
+
[:SCORE, [m[:score], score]]
|
196
|
+
elsif m[:minute]
|
197
|
+
minute = {}
|
198
|
+
minute[:m] = m[:value].to_i(10)
|
199
|
+
minute[:offset] = m[:value2].to_i(10) if m[:value2]
|
200
|
+
## note - for debugging keep (pass along) "literal" minute
|
201
|
+
[:MINUTE, [m[:minute], minute]]
|
202
|
+
elsif m[:og]
|
203
|
+
[:OG, m[:og]] ## for typed drop - string version/variants ?? why? why not?
|
204
|
+
elsif m[:pen]
|
205
|
+
[:PEN, m[:pen]]
|
206
|
+
elsif m[:vs]
|
207
|
+
[:VS, m[:vs]]
|
208
|
+
elsif m[:sym]
|
209
|
+
sym = m[:sym]
|
210
|
+
## return symbols "inline" as is - why? why not?
|
211
|
+
## (?<sym>[;,@|\[\]-])
|
212
|
+
|
213
|
+
case sym
|
214
|
+
when ',' then [:',']
|
215
|
+
when ';' then [:';']
|
216
|
+
when '@' then [:'@']
|
217
|
+
when '|' then [:'|']
|
218
|
+
when '[' then [:'[']
|
219
|
+
when ']' then [:']']
|
220
|
+
when '-' then [:'-']
|
221
|
+
else
|
222
|
+
nil ## ignore others (e.g. brackets [])
|
223
|
+
end
|
224
|
+
else
|
225
|
+
## report error
|
226
|
+
puts "!!! TOKENIZE ERROR - no match found"
|
227
|
+
nil
|
228
|
+
end
|
229
|
+
end
|
230
|
+
|
231
|
+
|
232
|
+
tokens << t if t
|
233
|
+
|
234
|
+
if debug
|
235
|
+
print ">"
|
236
|
+
print "*" * pos
|
237
|
+
puts "#{line[pos..-1]}<"
|
238
|
+
end
|
239
|
+
end
|
240
|
+
|
241
|
+
## check if no match in end of string
|
242
|
+
if offsets[1] != line.size
|
243
|
+
msg = "!! WARN - parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
|
244
|
+
puts msg
|
245
|
+
log( msg )
|
246
|
+
|
247
|
+
errors << "parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size}"
|
248
|
+
end
|
249
|
+
|
250
|
+
|
251
|
+
[tokens,errors]
|
252
|
+
end
|
253
|
+
|
254
|
+
|
255
|
+
### convenience helper - ignore errors by default
|
256
|
+
def tokenize( line, debug: false )
|
257
|
+
tokens, _ = tokenize_with_errors( line, debug: debug )
|
258
|
+
tokens
|
259
|
+
end
|
260
|
+
|
261
|
+
end # class Parser
|
262
|
+
end # module SportDb
|