sportdb-parser 0.4.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -2
- data/Manifest.txt +1 -0
- data/README.md +0 -5
- data/Rakefile +1 -0
- data/lib/sportdb/parser/parser.rb +817 -209
- data/lib/sportdb/parser/token-text.rb +1 -1
- data/lib/sportdb/parser/token.rb +146 -231
- data/lib/sportdb/parser/tokenizer.rb +262 -0
- data/lib/sportdb/parser/version.rb +2 -2
- data/lib/sportdb/parser.rb +296 -0
- metadata +17 -2
data/lib/sportdb/parser/token.rb
CHANGED
@@ -68,29 +68,46 @@ BASICS_RE = %r{
|
|
68
68
|
(?<vs>
|
69
69
|
(?<=[ ]) # Positive lookbehind for space
|
70
70
|
(?:
|
71
|
-
vs
|
72
|
-
|
73
|
-
|
74
|
-
|
71
|
+
vs|v
|
72
|
+
)
|
73
|
+
# not bigger match first e.g. vs than v etc.
|
74
|
+
# todo/fix - make vs|v case sensitive!!! only match v/vs - why? why not?
|
75
75
|
(?=[ ]) # positive lookahead for space
|
76
76
|
)
|
77
77
|
|
|
78
|
+
(?<spaces> [ ]{2,}) |
|
79
|
+
(?<space> [ ])
|
80
|
+
|
|
81
|
+
(?<sym>[;,@|\[\]-])
|
82
|
+
}ix
|
83
|
+
|
84
|
+
|
85
|
+
## removed from basics
|
86
|
+
=begin
|
78
87
|
(?<none>
|
79
88
|
(?<=[ \[]|^) # Positive lookbehind for space or [
|
80
89
|
-
|
81
90
|
(?=[ ]*;) # positive lookahead for space
|
82
91
|
)
|
83
92
|
|
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
93
|
+
(?<vs>
|
94
|
+
(?<=[ ]) # Positive lookbehind for space
|
95
|
+
(?:
|
96
|
+
vs\.?| ## allow optional dot (eg. vs. v.)
|
97
|
+
v\.?|
|
98
|
+
-
|
99
|
+
) # not bigger match first e.g. vs than v etc.
|
100
|
+
(?=[ ]) # positive lookahead for space
|
101
|
+
)
|
102
|
+
|
|
103
|
+
|
104
|
+
make - into a simple symbol !!!
|
105
|
+
=end
|
89
106
|
|
90
107
|
|
91
108
|
MINUTE_RE = %r{
|
92
109
|
(?<minute>
|
93
|
-
(?<=[ ]) # Positive lookbehind for space required
|
110
|
+
(?<=[ (]) # Positive lookbehind for space or opening ( e.g. (61') required
|
94
111
|
(?<value>\d{1,3}) ## constrain numbers to 0 to 999!!!
|
95
112
|
(?: \+
|
96
113
|
(?<value2>\d{1,3})
|
@@ -119,7 +136,125 @@ GOAL_OG_RE = %r{
|
|
119
136
|
|
120
137
|
|
121
138
|
|
122
|
-
|
139
|
+
|
140
|
+
|
141
|
+
PROP_BASICS_RE = %r{
|
142
|
+
(?<spaces> [ ]{2,}) |
|
143
|
+
(?<space> [ ])
|
144
|
+
|
|
145
|
+
(?<sym>[.;,\(\)\[\]-]) ## note - dot (.) is the (all-important) end-of-prop marker!!!
|
146
|
+
}ix
|
147
|
+
|
148
|
+
|
149
|
+
## name different from text (does not allow number in name/text)
|
150
|
+
##
|
151
|
+
## note - includes special handling for dot (.) if at the end of line!!!
|
152
|
+
## end-of-line dot (.) is the prop end-of-marker - do NOT eat-up!!!
|
153
|
+
|
154
|
+
PROP_NAME_RE = %r{
|
155
|
+
(?<prop_name> \b
|
156
|
+
(?<name>
|
157
|
+
\p{L}+
|
158
|
+
(?: \. (?: (?![ ]*$) )
|
159
|
+
)? ## edge case - check for end of prop marker! (e.g. Stop.)
|
160
|
+
(?:
|
161
|
+
[ ]? # only single spaces allowed inline!!!
|
162
|
+
(?:
|
163
|
+
(?:
|
164
|
+
(?<=\p{L}) ## use lookbehind
|
165
|
+
[/'-] ## must be surrounded by letters
|
166
|
+
## e.g. One/Two NOT
|
167
|
+
## One/ Two or One / Two or One /Two etc.
|
168
|
+
(?=\p{L}) ## use lookahead
|
169
|
+
)
|
170
|
+
|
|
171
|
+
(?:
|
172
|
+
(?<=[ ]) ## use lookbehind -- add letter (plus dot) or such - why? why not?
|
173
|
+
['] ## must be surrounded by leading space and
|
174
|
+
## traling letters (e.g. UDI 'Beter Bed)
|
175
|
+
(?=\p{L}) ## use lookahead
|
176
|
+
)
|
177
|
+
|
|
178
|
+
(?:
|
179
|
+
(?<=\p{L}) ## use lookbehind
|
180
|
+
['] ## must be surrounded by leading letter and
|
181
|
+
## trailing space PLUS letter (e.g. UDI' Beter Bed)
|
182
|
+
(?=[ ]\p{L}) ## use lookahead (space WITH letter
|
183
|
+
)
|
184
|
+
|
|
185
|
+
(?: \p{L}+
|
186
|
+
(?: \.
|
187
|
+
(?: (?![ ]*$) )
|
188
|
+
)? ## last dot is delimiter!!!
|
189
|
+
)
|
190
|
+
)+
|
191
|
+
)*
|
192
|
+
)
|
193
|
+
## add lookahead - must be non-alphanum (or dot)
|
194
|
+
(?=[ .,;\]\)]|$)
|
195
|
+
)
|
196
|
+
}ix
|
197
|
+
|
198
|
+
|
199
|
+
|
200
|
+
|
201
|
+
##############
|
202
|
+
# add support for props/ attributes e.g.
|
203
|
+
#
|
204
|
+
# Germany: Neuer - Kimmich, Rüdiger, Tah [Y], Mittelstädt – Andrich [Y] (46' Groß),
|
205
|
+
# Kroos (80' Can) – Musiala (74' Müller), Gündogan,
|
206
|
+
# Wirtz (63' Sane) – Havertz (63' Füllkrug).
|
207
|
+
# Scotland: Gunn – Porteous [R 44'], Hendry, Tierney (78' McKenna) – Ralston [Y],
|
208
|
+
# McTominay, McGregor (67' Gilmour), Robertson – Christie (82' Shankland),
|
209
|
+
# Adams (46' Hanley), McGinn (67' McLean).
|
210
|
+
#
|
211
|
+
## note: colon (:) MUST be followed by one (or more) spaces
|
212
|
+
## make sure mon feb 12 18:10 will not match
|
213
|
+
## allow 1. FC Köln etc.
|
214
|
+
## Mainz 05:
|
215
|
+
## limit to 30 chars max
|
216
|
+
## only allow chars incl. intl but (NOT ()[]/;)
|
217
|
+
|
218
|
+
|
219
|
+
PROP_KEY_RE = %r{
|
220
|
+
(?<prop_key> \b
|
221
|
+
(?<key>
|
222
|
+
(?:\p{L}+
|
223
|
+
|
|
224
|
+
\d+ # check for num lookahead (MUST be space or dot)
|
225
|
+
## MUST be followed by (optional dot) and
|
226
|
+
## required space !!!
|
227
|
+
## MUST be follow by a to z!!!!
|
228
|
+
\.? ## optional dot
|
229
|
+
[ ]? ## make space optional too - why? why not?
|
230
|
+
## yes - eg. 1st, 2nd, 5th etc.
|
231
|
+
\p{L}+
|
232
|
+
)
|
233
|
+
[\d\p{L}'/° -]*? ## allow almost anyting
|
234
|
+
## fix - add negative lookahead
|
235
|
+
## no space and dash etc.
|
236
|
+
## only allowed "inline" not at the end
|
237
|
+
## must end with latter or digit!
|
238
|
+
)
|
239
|
+
[ ]*? # slurp trailing spaces
|
240
|
+
:
|
241
|
+
(?=[ ]+) ## possitive lookahead (must be followed by space!!)
|
242
|
+
)
|
243
|
+
}ix
|
244
|
+
|
245
|
+
|
246
|
+
|
247
|
+
|
248
|
+
PROP_RE = Regexp.union(
|
249
|
+
PROP_BASICS_RE,
|
250
|
+
MINUTE_RE,
|
251
|
+
PROP_NAME_RE,
|
252
|
+
)
|
253
|
+
|
254
|
+
|
255
|
+
|
256
|
+
RE = Regexp.union( PROP_KEY_RE, ## start with prop key (match will/should switch into prop mode!!!)
|
257
|
+
STATUS_RE,
|
123
258
|
TIMEZONE_RE,
|
124
259
|
TIME_RE,
|
125
260
|
DURATION_RE, # note - duration MUST match before date
|
@@ -130,225 +265,5 @@ RE = Regexp.union( STATUS_RE,
|
|
130
265
|
TEXT_RE )
|
131
266
|
|
132
267
|
|
133
|
-
def log( msg )
|
134
|
-
## append msg to ./logs.txt
|
135
|
-
## use ./errors.txt - why? why not?
|
136
|
-
File.open( './logs.txt', 'a:utf-8' ) do |f|
|
137
|
-
f.write( msg )
|
138
|
-
f.write( "\n" )
|
139
|
-
end
|
140
|
-
end
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
def tokenize_with_errors( line, typed: false,
|
145
|
-
debug: false )
|
146
|
-
tokens = []
|
147
|
-
errors = [] ## keep a list of errors - why? why not?
|
148
|
-
|
149
|
-
puts ">#{line}<" if debug
|
150
|
-
|
151
|
-
pos = 0
|
152
|
-
## track last offsets - to report error on no match
|
153
|
-
## or no match in end of string
|
154
|
-
offsets = [0,0]
|
155
|
-
m = nil
|
156
|
-
|
157
|
-
while m = RE.match( line, pos )
|
158
|
-
if debug
|
159
|
-
pp m
|
160
|
-
puts "pos: #{pos}"
|
161
|
-
end
|
162
|
-
offsets = [m.begin(0), m.end(0)]
|
163
|
-
|
164
|
-
if offsets[0] != pos
|
165
|
-
## match NOT starting at start/begin position!!!
|
166
|
-
## report parse error!!!
|
167
|
-
msg = "!! WARN - parse error - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
|
168
|
-
puts msg
|
169
|
-
|
170
|
-
errors << "parse error - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]}"
|
171
|
-
log( msg )
|
172
|
-
end
|
173
|
-
|
174
|
-
##
|
175
|
-
## todo/fix - also check if possible
|
176
|
-
## if no match but not yet end off string!!!!
|
177
|
-
## report skipped text run too!!!
|
178
|
-
|
179
|
-
pos = offsets[1]
|
180
|
-
|
181
|
-
pp offsets if debug
|
182
|
-
|
183
|
-
t = if m[:space]
|
184
|
-
## skip space
|
185
|
-
nil
|
186
|
-
elsif m[:spaces]
|
187
|
-
## skip spaces
|
188
|
-
nil
|
189
|
-
elsif m[:text]
|
190
|
-
[:text, m[:text]] ## keep pos - why? why not?
|
191
|
-
elsif m[:status] ## (match) status e.g. cancelled, awarded, etc.
|
192
|
-
if m[:status_note] ## includes note? e.g. awarded; originally 2-0
|
193
|
-
[:status, m[:status], {note:m[:status_note]}]
|
194
|
-
else
|
195
|
-
[:status, m[:status]]
|
196
|
-
end
|
197
|
-
elsif m[:time]
|
198
|
-
if typed
|
199
|
-
## unify to iso-format
|
200
|
-
### 12.40 => 12:40
|
201
|
-
## 12h40 => 12:40 etc.
|
202
|
-
## keep string (no time-only type in ruby)
|
203
|
-
hour = m[:hour].to_i(10) ## allow 08/07/etc.
|
204
|
-
minute = m[:minute].to_i(10)
|
205
|
-
## check if valid - 0:00 - 24:00
|
206
|
-
## check if 24:00 possible? or only 0:00 (23:59)
|
207
|
-
if (hour >= 0 && hour <= 24) &&
|
208
|
-
(minute >=0 && minute <= 59)
|
209
|
-
## note - for debugging keep (pass along) "literal" time
|
210
|
-
## might use/add support for am/pm later
|
211
|
-
[:time, m[:time], {h:hour,m:minute}]
|
212
|
-
else
|
213
|
-
raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
|
214
|
-
end
|
215
|
-
else
|
216
|
-
[:time, m[:time]]
|
217
|
-
end
|
218
|
-
elsif m[:date]
|
219
|
-
if typed
|
220
|
-
date = {}
|
221
|
-
=begin
|
222
|
-
((?<day_name>#{DAY_NAMES})
|
223
|
-
[ ]
|
224
|
-
)?
|
225
|
-
(?<month_name>#{MONTH_NAMES})
|
226
|
-
(?: \/|[ ] )
|
227
|
-
(?<day>\d{1,2})
|
228
|
-
## optional year
|
229
|
-
( [ ]
|
230
|
-
(?<year>\d{4})
|
231
|
-
)?
|
232
|
-
=end
|
233
|
-
## map month names
|
234
|
-
## note - allow any/upcase JULY/JUL etc. thus ALWAYS downcase for lookup
|
235
|
-
date[:y] = m[:year].to_i(10) if m[:year]
|
236
|
-
date[:m] = MONTH_MAP[ m[:month_name].downcase ] if m[:month_name]
|
237
|
-
date[:d] = m[:day].to_i(10) if m[:day]
|
238
|
-
date[:wday] = DAY_MAP[ m[:day_name].downcase ] if m[:day_name]
|
239
|
-
## note - for debugging keep (pass along) "literal" date
|
240
|
-
[:date, m[:date], date]
|
241
|
-
else
|
242
|
-
[:date, m[:date]]
|
243
|
-
end
|
244
|
-
elsif m[:timezone]
|
245
|
-
[:timezone, m[:timezone]]
|
246
|
-
elsif m[:duration]
|
247
|
-
if typed
|
248
|
-
duration = { start: {}, end: {}}
|
249
|
-
duration[:start][:y] = m[:year1].to_i(10) if m[:year1]
|
250
|
-
duration[:start][:m] = MONTH_MAP[ m[:month_name1].downcase ] if m[:month_name1]
|
251
|
-
duration[:start][:d] = m[:day1].to_i(10) if m[:day1]
|
252
|
-
duration[:start][:wday] = DAY_MAP[ m[:day_name1].downcase ] if m[:day_name1]
|
253
|
-
duration[:end][:y] = m[:year2].to_i(10) if m[:year2]
|
254
|
-
duration[:end][:m] = MONTH_MAP[ m[:month_name2].downcase ] if m[:month_name2]
|
255
|
-
duration[:end][:d] = m[:day2].to_i(10) if m[:day2]
|
256
|
-
duration[:end][:wday] = DAY_MAP[ m[:day_name2].downcase ] if m[:day_name2]
|
257
|
-
## note - for debugging keep (pass along) "literal" duration
|
258
|
-
[:duration, m[:duration], duration]
|
259
|
-
else
|
260
|
-
[:duration, m[:duration]]
|
261
|
-
end
|
262
|
-
elsif m[:num]
|
263
|
-
if typed
|
264
|
-
## note - strip enclosing () and convert to integer
|
265
|
-
[:num, m[:value].to_i(10)]
|
266
|
-
else
|
267
|
-
[:num, m[:num]]
|
268
|
-
end
|
269
|
-
elsif m[:score]
|
270
|
-
if typed
|
271
|
-
score = {}
|
272
|
-
## check for pen
|
273
|
-
score[:p] = [m[:p1].to_i(10),
|
274
|
-
m[:p2].to_i(10)] if m[:p1] && m[:p2]
|
275
|
-
score[:et] = [m[:et1].to_i(10),
|
276
|
-
m[:et2].to_i(10)] if m[:et1] && m[:et2]
|
277
|
-
score[:ft] = [m[:ft1].to_i(10),
|
278
|
-
m[:ft2].to_i(10)] if m[:ft1] && m[:ft2]
|
279
|
-
score[:ht] = [m[:ht1].to_i(10),
|
280
|
-
m[:ht2].to_i(10)] if m[:ht1] && m[:ht2]
|
281
|
-
|
282
|
-
## note - for debugging keep (pass along) "literal" score
|
283
|
-
[:score, m[:score], score]
|
284
|
-
else
|
285
|
-
[:score, m[:score]]
|
286
|
-
end
|
287
|
-
elsif m[:minute]
|
288
|
-
if typed
|
289
|
-
minute = {}
|
290
|
-
minute[:m] = m[:value].to_i(10)
|
291
|
-
minute[:offset] = m[:value2].to_i(10) if m[:value2]
|
292
|
-
## note - for debugging keep (pass along) "literal" minute
|
293
|
-
[:minute, m[:minute], minute]
|
294
|
-
else
|
295
|
-
[:minute, m[:minute]]
|
296
|
-
end
|
297
|
-
elsif m[:og]
|
298
|
-
typed ? [:og] : [:og, m[:og]] ## for typed drop - string version/variants
|
299
|
-
elsif m[:pen]
|
300
|
-
typed ? [:pen] : [:pen, m[:pen]]
|
301
|
-
elsif m[:vs]
|
302
|
-
typed ? [:vs] : [:vs, m[:vs]]
|
303
|
-
elsif m[:none]
|
304
|
-
typed ? [:none] : [:none, m[:none]]
|
305
|
-
elsif m[:sym]
|
306
|
-
sym = m[:sym]
|
307
|
-
## return symbols "inline" as is - why? why not?
|
308
|
-
case sym
|
309
|
-
when ',' then [:',']
|
310
|
-
when ';' then [:';']
|
311
|
-
when '@' then [:'@']
|
312
|
-
when '|' then [:'|']
|
313
|
-
else
|
314
|
-
nil ## ignore others (e.g. brackets [])
|
315
|
-
end
|
316
|
-
else
|
317
|
-
## report error
|
318
|
-
nil
|
319
|
-
end
|
320
|
-
|
321
|
-
tokens << t if t
|
322
|
-
|
323
|
-
if debug
|
324
|
-
print ">"
|
325
|
-
print "*" * pos
|
326
|
-
puts "#{line[pos..-1]}<"
|
327
|
-
end
|
328
|
-
end
|
329
|
-
|
330
|
-
## check if no match in end of string
|
331
|
-
if offsets[1] != line.size
|
332
|
-
msg = "!! WARN - parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
|
333
|
-
puts msg
|
334
|
-
log( msg )
|
335
|
-
|
336
|
-
errors << "parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size}"
|
337
|
-
end
|
338
|
-
|
339
|
-
|
340
|
-
[tokens,errors]
|
341
|
-
end
|
342
|
-
|
343
|
-
|
344
|
-
### convience helper - ignore errors by default
|
345
|
-
def tokenize( line, typed: false,
|
346
|
-
debug: false )
|
347
|
-
tokens, _ = tokenize_with_errors( line, typed: typed,
|
348
|
-
debug: debug )
|
349
|
-
tokens
|
350
|
-
end
|
351
|
-
|
352
|
-
|
353
268
|
end # class Parser
|
354
269
|
end # module SportDb
|
@@ -0,0 +1,262 @@
|
|
1
|
+
|
2
|
+
module SportDb
|
3
|
+
class Parser
|
4
|
+
|
5
|
+
|
6
|
+
|
7
|
+
def log( msg )
|
8
|
+
## append msg to ./logs.txt
|
9
|
+
## use ./errors.txt - why? why not?
|
10
|
+
File.open( './logs.txt', 'a:utf-8' ) do |f|
|
11
|
+
f.write( msg )
|
12
|
+
f.write( "\n" )
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
|
17
|
+
|
18
|
+
def tokenize_with_errors( line, debug: false )
|
19
|
+
tokens = []
|
20
|
+
errors = [] ## keep a list of errors - why? why not?
|
21
|
+
|
22
|
+
puts ">#{line}<" if debug
|
23
|
+
|
24
|
+
pos = 0
|
25
|
+
## track last offsets - to report error on no match
|
26
|
+
## or no match in end of string
|
27
|
+
offsets = [0,0]
|
28
|
+
m = nil
|
29
|
+
|
30
|
+
|
31
|
+
####
|
32
|
+
## quick hack - keep re state/mode between tokenize calls!!!
|
33
|
+
@re ||= RE ## note - switch between RE & INSIDE_RE
|
34
|
+
|
35
|
+
|
36
|
+
while m = @re.match( line, pos )
|
37
|
+
if debug
|
38
|
+
pp m
|
39
|
+
puts "pos: #{pos}"
|
40
|
+
end
|
41
|
+
offsets = [m.begin(0), m.end(0)]
|
42
|
+
|
43
|
+
if offsets[0] != pos
|
44
|
+
## match NOT starting at start/begin position!!!
|
45
|
+
## report parse error!!!
|
46
|
+
msg = "!! WARN - parse error - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
|
47
|
+
puts msg
|
48
|
+
|
49
|
+
errors << "parse error - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]}"
|
50
|
+
log( msg )
|
51
|
+
end
|
52
|
+
|
53
|
+
##
|
54
|
+
## todo/fix - also check if possible
|
55
|
+
## if no match but not yet end off string!!!!
|
56
|
+
## report skipped text run too!!!
|
57
|
+
|
58
|
+
pos = offsets[1]
|
59
|
+
|
60
|
+
pp offsets if debug
|
61
|
+
|
62
|
+
##
|
63
|
+
## note: racc requires pairs e.g. [:TOKEN, VAL]
|
64
|
+
## for VAL use "text" or ["text", { opts }] array
|
65
|
+
|
66
|
+
|
67
|
+
t = if @re == PROP_RE
|
68
|
+
if m[:space]
|
69
|
+
## skip space
|
70
|
+
nil
|
71
|
+
elsif m[:spaces]
|
72
|
+
## skip spaces
|
73
|
+
nil
|
74
|
+
elsif m[:prop_name]
|
75
|
+
if m[:name] == 'Y'
|
76
|
+
[:YELLOW_CARD, m[:name]]
|
77
|
+
elsif m[:name] == 'R'
|
78
|
+
[:RED_CARD, m[:name]]
|
79
|
+
else
|
80
|
+
[:PROP_NAME, m[:name]]
|
81
|
+
end
|
82
|
+
elsif m[:minute]
|
83
|
+
minute = {}
|
84
|
+
minute[:m] = m[:value].to_i(10)
|
85
|
+
minute[:offset] = m[:value2].to_i(10) if m[:value2]
|
86
|
+
## note - for debugging keep (pass along) "literal" minute
|
87
|
+
[:MINUTE, [m[:minute], minute]]
|
88
|
+
elsif m[:sym]
|
89
|
+
sym = m[:sym]
|
90
|
+
## return symbols "inline" as is - why? why not?
|
91
|
+
## (?<sym>[;,@|\[\]-])
|
92
|
+
|
93
|
+
case sym
|
94
|
+
when ',' then [:',']
|
95
|
+
when ';' then [:';']
|
96
|
+
when '[' then [:'[']
|
97
|
+
when ']' then [:']']
|
98
|
+
when '(' then [:'(']
|
99
|
+
when ')' then [:')']
|
100
|
+
when '-' then [:'-']
|
101
|
+
when '.' then
|
102
|
+
## switch back to top-level mode!!
|
103
|
+
puts " LEAVE PROP_RE MODE, BACK TO TOP_LEVEL/RE"
|
104
|
+
@re = RE
|
105
|
+
[:'.']
|
106
|
+
else
|
107
|
+
nil ## ignore others (e.g. brackets [])
|
108
|
+
end
|
109
|
+
else
|
110
|
+
## report error
|
111
|
+
puts "!!! TOKENIZE ERROR (PROP_RE) - no match found"
|
112
|
+
nil
|
113
|
+
end
|
114
|
+
else ## assume TOP_LEVEL (a.k.a. RE) machinery
|
115
|
+
if m[:space]
|
116
|
+
## skip space
|
117
|
+
nil
|
118
|
+
elsif m[:spaces]
|
119
|
+
## skip spaces
|
120
|
+
nil
|
121
|
+
elsif m[:prop_key]
|
122
|
+
## switch context to PROP_RE
|
123
|
+
@re = PROP_RE
|
124
|
+
puts " ENTER PROP_RE MODE"
|
125
|
+
[:PROP, m[:key]]
|
126
|
+
elsif m[:text]
|
127
|
+
[:TEXT, m[:text]] ## keep pos - why? why not?
|
128
|
+
elsif m[:status] ## (match) status e.g. cancelled, awarded, etc.
|
129
|
+
## todo/check - add text (or status)
|
130
|
+
# to opts hash {} by default (for value)
|
131
|
+
if m[:status_note] ## includes note? e.g. awarded; originally 2-0
|
132
|
+
[:STATUS, [m[:status], {status: m[:status],
|
133
|
+
note: m[:status_note]} ]]
|
134
|
+
else
|
135
|
+
[:STATUS, [m[:status], {status: m[:status] } ]]
|
136
|
+
end
|
137
|
+
elsif m[:time]
|
138
|
+
## unify to iso-format
|
139
|
+
### 12.40 => 12:40
|
140
|
+
## 12h40 => 12:40 etc.
|
141
|
+
## keep string (no time-only type in ruby)
|
142
|
+
hour = m[:hour].to_i(10) ## allow 08/07/etc.
|
143
|
+
minute = m[:minute].to_i(10)
|
144
|
+
## check if valid - 0:00 - 24:00
|
145
|
+
## check if 24:00 possible? or only 0:00 (23:59)
|
146
|
+
if (hour >= 0 && hour <= 24) &&
|
147
|
+
(minute >=0 && minute <= 59)
|
148
|
+
## note - for debugging keep (pass along) "literal" time
|
149
|
+
## might use/add support for am/pm later
|
150
|
+
[:TIME, [m[:time], {h:hour,m:minute}]]
|
151
|
+
else
|
152
|
+
raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
|
153
|
+
end
|
154
|
+
elsif m[:date]
|
155
|
+
date = {}
|
156
|
+
## map month names
|
157
|
+
## note - allow any/upcase JULY/JUL etc. thus ALWAYS downcase for lookup
|
158
|
+
date[:y] = m[:year].to_i(10) if m[:year]
|
159
|
+
date[:m] = MONTH_MAP[ m[:month_name].downcase ] if m[:month_name]
|
160
|
+
date[:d] = m[:day].to_i(10) if m[:day]
|
161
|
+
date[:wday] = DAY_MAP[ m[:day_name].downcase ] if m[:day_name]
|
162
|
+
## note - for debugging keep (pass along) "literal" date
|
163
|
+
[:DATE, [m[:date], date]]
|
164
|
+
elsif m[:timezone]
|
165
|
+
[:TIMEZONE, m[:timezone]]
|
166
|
+
elsif m[:duration]
|
167
|
+
## todo/check/fix - if end: works for kwargs!!!!!
|
168
|
+
duration = { start: {}, end: {}}
|
169
|
+
duration[:start][:y] = m[:year1].to_i(10) if m[:year1]
|
170
|
+
duration[:start][:m] = MONTH_MAP[ m[:month_name1].downcase ] if m[:month_name1]
|
171
|
+
duration[:start][:d] = m[:day1].to_i(10) if m[:day1]
|
172
|
+
duration[:start][:wday] = DAY_MAP[ m[:day_name1].downcase ] if m[:day_name1]
|
173
|
+
duration[:end][:y] = m[:year2].to_i(10) if m[:year2]
|
174
|
+
duration[:end][:m] = MONTH_MAP[ m[:month_name2].downcase ] if m[:month_name2]
|
175
|
+
duration[:end][:d] = m[:day2].to_i(10) if m[:day2]
|
176
|
+
duration[:end][:wday] = DAY_MAP[ m[:day_name2].downcase ] if m[:day_name2]
|
177
|
+
## note - for debugging keep (pass along) "literal" duration
|
178
|
+
[:DURATION, [m[:duration], duration]]
|
179
|
+
elsif m[:num] ## fix - change to ord (for ordinal number!!!)
|
180
|
+
## note - strip enclosing () and convert to integer
|
181
|
+
[:ORD, [m[:num], { value: m[:value].to_i(10) } ]]
|
182
|
+
elsif m[:score]
|
183
|
+
score = {}
|
184
|
+
## check for pen
|
185
|
+
score[:p] = [m[:p1].to_i(10),
|
186
|
+
m[:p2].to_i(10)] if m[:p1] && m[:p2]
|
187
|
+
score[:et] = [m[:et1].to_i(10),
|
188
|
+
m[:et2].to_i(10)] if m[:et1] && m[:et2]
|
189
|
+
score[:ft] = [m[:ft1].to_i(10),
|
190
|
+
m[:ft2].to_i(10)] if m[:ft1] && m[:ft2]
|
191
|
+
score[:ht] = [m[:ht1].to_i(10),
|
192
|
+
m[:ht2].to_i(10)] if m[:ht1] && m[:ht2]
|
193
|
+
|
194
|
+
## note - for debugging keep (pass along) "literal" score
|
195
|
+
[:SCORE, [m[:score], score]]
|
196
|
+
elsif m[:minute]
|
197
|
+
minute = {}
|
198
|
+
minute[:m] = m[:value].to_i(10)
|
199
|
+
minute[:offset] = m[:value2].to_i(10) if m[:value2]
|
200
|
+
## note - for debugging keep (pass along) "literal" minute
|
201
|
+
[:MINUTE, [m[:minute], minute]]
|
202
|
+
elsif m[:og]
|
203
|
+
[:OG, m[:og]] ## for typed drop - string version/variants ?? why? why not?
|
204
|
+
elsif m[:pen]
|
205
|
+
[:PEN, m[:pen]]
|
206
|
+
elsif m[:vs]
|
207
|
+
[:VS, m[:vs]]
|
208
|
+
elsif m[:sym]
|
209
|
+
sym = m[:sym]
|
210
|
+
## return symbols "inline" as is - why? why not?
|
211
|
+
## (?<sym>[;,@|\[\]-])
|
212
|
+
|
213
|
+
case sym
|
214
|
+
when ',' then [:',']
|
215
|
+
when ';' then [:';']
|
216
|
+
when '@' then [:'@']
|
217
|
+
when '|' then [:'|']
|
218
|
+
when '[' then [:'[']
|
219
|
+
when ']' then [:']']
|
220
|
+
when '-' then [:'-']
|
221
|
+
else
|
222
|
+
nil ## ignore others (e.g. brackets [])
|
223
|
+
end
|
224
|
+
else
|
225
|
+
## report error
|
226
|
+
puts "!!! TOKENIZE ERROR - no match found"
|
227
|
+
nil
|
228
|
+
end
|
229
|
+
end
|
230
|
+
|
231
|
+
|
232
|
+
tokens << t if t
|
233
|
+
|
234
|
+
if debug
|
235
|
+
print ">"
|
236
|
+
print "*" * pos
|
237
|
+
puts "#{line[pos..-1]}<"
|
238
|
+
end
|
239
|
+
end
|
240
|
+
|
241
|
+
## check if no match in end of string
|
242
|
+
if offsets[1] != line.size
|
243
|
+
msg = "!! WARN - parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
|
244
|
+
puts msg
|
245
|
+
log( msg )
|
246
|
+
|
247
|
+
errors << "parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size}"
|
248
|
+
end
|
249
|
+
|
250
|
+
|
251
|
+
[tokens,errors]
|
252
|
+
end
|
253
|
+
|
254
|
+
|
255
|
+
### convience helper - ignore errors by default
|
256
|
+
def tokenize( line, debug: false )
|
257
|
+
tokens, _ = tokenize_with_errors( line, debug: debug )
|
258
|
+
tokens
|
259
|
+
end
|
260
|
+
|
261
|
+
end # class Parser
|
262
|
+
end # module SportDb
|