sportdb-parser 0.6.0 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/lib/sportdb/parser/lexer.rb +63 -10
- data/lib/sportdb/parser/parser.rb +521 -404
- data/lib/sportdb/parser/racc_parser.rb +4 -2
- data/lib/sportdb/parser/token-date.rb +66 -15
- data/lib/sportdb/parser/token-minute.rb +19 -4
- data/lib/sportdb/parser/token-score.rb +25 -14
- data/lib/sportdb/parser/token-status.rb +109 -0
- data/lib/sportdb/parser/token.rb +13 -2
- data/lib/sportdb/parser/version.rb +1 -1
- metadata +2 -2
@@ -68,12 +68,14 @@ def initialize( txt, debug: false )
|
|
68
68
|
|
69
69
|
|
70
70
|
def on_error(error_token_id, error_value, value_stack)
|
71
|
-
|
71
|
+
## auto-add error_token (as string)
|
72
|
+
error_token = Racc_token_to_s_table[error_token_id]
|
73
|
+
args = [error_token, error_token_id, error_value, value_stack]
|
72
74
|
puts
|
73
75
|
puts "!! on parse error:"
|
74
76
|
puts "args=#{args.pretty_inspect}"
|
75
77
|
|
76
|
-
@errors << "parse error on token: #{error_token_id} with value: #{error_value}, stack: #{value_stack.pretty_inspect}"
|
78
|
+
@errors << "parse error on token: #{error_token} (#{error_token_id}) with value: #{error_value}, stack: #{value_stack.pretty_inspect}"
|
77
79
|
## exit 1 ## exit for now - get and print more info about context etc.!!
|
78
80
|
end
|
79
81
|
|
@@ -147,15 +147,15 @@ DATE_II_RE = %r{
|
|
147
147
|
|
148
148
|
|
149
149
|
# e.g. iso-date - 2011-08-25
|
150
|
-
##
|
150
|
+
## note - allow/support ("shortcuts") e.g 2011-8-25 or 2011-8-3 / 2011-08-03 etc.
|
151
151
|
DATE_III_RE = %r{
|
152
152
|
(?<date>
|
153
153
|
\b
|
154
154
|
(?<year>\d{4})
|
155
155
|
-
|
156
|
-
(?<month>\d{2})
|
156
|
+
(?<month>\d{1,2})
|
157
157
|
-
|
158
|
-
(?<day>\d{2})
|
158
|
+
(?<day>\d{1,2})
|
159
159
|
\b
|
160
160
|
)}ix
|
161
161
|
|
@@ -214,29 +214,36 @@ end
|
|
214
214
|
#
|
215
215
|
# Sun Jun/23 - Wed Jun/26 -- YES
|
216
216
|
# Jun/23 - Jun/26 -- YES
|
217
|
-
#
|
218
|
-
|
219
|
-
#
|
220
|
-
# Jun/25
|
217
|
+
# Jun/25 - 26 - why? why not??? - YES - see below variant iii!!!
|
218
|
+
|
219
|
+
# Tue Jun/25 + Wed Jun/26 -- NO
|
220
|
+
# Jun/25 + Jun/26 -- NO
|
221
221
|
# Jun/25 .. 26 - why? why not???
|
222
222
|
# Jun/25 to 26 - why? why not???
|
223
223
|
# Jun/25 + 26 - add - why? why not???
|
224
224
|
# Sun-Wed Jun/23-26 - add - why? why not???
|
225
225
|
# Wed+Thu Jun/26+27 2024 - add - why? why not???
|
226
226
|
#
|
227
|
-
# maybe use
|
227
|
+
# maybe use comma and plus for list of dates
|
228
228
|
# Tue Jun/25, Wed Jun/26, Thu Jun/27 ??
|
229
229
|
# Tue Jun/25 + Wed Jun/26 + Thu Jun/27 ??
|
230
230
|
#
|
231
231
|
# add back optional comma (before) year - why? why not?
|
232
|
+
#
|
232
233
|
|
233
234
|
|
234
235
|
##
|
235
236
|
# todo add plus later on - why? why not?
|
237
|
+
### todo/fix add optional comma (,) before year
|
238
|
+
|
239
|
+
### regex note/tip/reminder - \b () \b MUST always get enclosed in parentheses
|
240
|
+
## because alternation (|) has lowest priority/binding
|
241
|
+
|
236
242
|
|
237
243
|
DURATION_I_RE = %r{
|
238
244
|
(?<duration>
|
239
245
|
\b
|
246
|
+
(?:
|
240
247
|
## optional day name
|
241
248
|
((?<day_name1>#{DAY_NAMES})
|
242
249
|
[ ]
|
@@ -245,12 +252,13 @@ DURATION_I_RE = %r{
|
|
245
252
|
(?: \/|[ ] )
|
246
253
|
(?<day1>\d{1,2})
|
247
254
|
## optional year
|
248
|
-
(
|
255
|
+
( ,? # optional comma
|
256
|
+
[ ]
|
249
257
|
(?<year1>\d{4})
|
250
258
|
)?
|
251
259
|
|
252
260
|
## support + and - (add .. or such - why??)
|
253
|
-
[ ]*
|
261
|
+
[ ]* - [ ]*
|
254
262
|
|
255
263
|
## optional day name
|
256
264
|
((?<day_name2>#{DAY_NAMES})
|
@@ -260,20 +268,28 @@ DURATION_I_RE = %r{
|
|
260
268
|
(?: \/|[ ] )
|
261
269
|
(?<day2>\d{1,2})
|
262
270
|
## optional year
|
263
|
-
(
|
271
|
+
( ,? # optional comma
|
272
|
+
[ ]
|
264
273
|
(?<year2>\d{4})
|
265
274
|
)?
|
275
|
+
)
|
266
276
|
\b
|
267
277
|
)}ix
|
268
278
|
|
269
279
|
|
280
|
+
|
281
|
+
# FIX - remove this variant
|
282
|
+
# "standardize on month day [year]" !!!!
|
283
|
+
|
284
|
+
=begin
|
270
285
|
###
|
271
286
|
# variant ii
|
272
287
|
# e.g. 26 July - 27 July
|
273
|
-
|
274
|
-
|
288
|
+
# 26 July,
|
289
|
+
XXX_DURATION_II_RE = %r{
|
275
290
|
(?<duration>
|
276
291
|
\b
|
292
|
+
(?
|
277
293
|
## optional day name
|
278
294
|
((?<day_name1>#{DAY_NAMES})
|
279
295
|
[ ]
|
@@ -282,7 +298,8 @@ DURATION_II_RE = %r{
|
|
282
298
|
[ ]
|
283
299
|
(?<month_name1>#{MONTH_NAMES})
|
284
300
|
## optional year
|
285
|
-
(
|
301
|
+
(
|
302
|
+
[ ]
|
286
303
|
(?<year1>\d{4})
|
287
304
|
)?
|
288
305
|
|
@@ -300,16 +317,50 @@ DURATION_II_RE = %r{
|
|
300
317
|
( [ ]
|
301
318
|
(?<year2>\d{4})
|
302
319
|
)?
|
320
|
+
)
|
321
|
+
\b
|
322
|
+
)}ix
|
323
|
+
=end
|
324
|
+
|
325
|
+
|
326
|
+
# variant ii
|
327
|
+
# add support for shorthand
|
328
|
+
# August 16-18, 2011
|
329
|
+
# September 13-15, 2011
|
330
|
+
# October 18-20, 2011
|
331
|
+
# March/6-8, 2012
|
332
|
+
# March 6-8 2012
|
333
|
+
# March 6-8
|
334
|
+
#
|
335
|
+
# - add support for August 16+17 or such (and check 16+18)
|
336
|
+
# use <op> to check if day2 is a plus or range or such - why? why not?
|
337
|
+
|
338
|
+
DURATION_II_RE = %r{
|
339
|
+
(?<duration>
|
340
|
+
\b
|
341
|
+
(?:
|
342
|
+
(?<month_name1>#{MONTH_NAMES})
|
343
|
+
[ /]
|
344
|
+
(?<day1>\d{1,2})
|
345
|
+
-
|
346
|
+
(?<day2>\d{1,2})
|
347
|
+
(?:
|
348
|
+
,? ## optional comma
|
349
|
+
[ ]
|
350
|
+
(?<year1>\d{4})
|
351
|
+
)? ## optional year
|
352
|
+
)
|
303
353
|
\b
|
304
354
|
)}ix
|
305
355
|
|
306
356
|
|
357
|
+
|
307
358
|
#############################################
|
308
359
|
# map tables
|
309
360
|
# note: order matters; first come-first matched/served
|
310
361
|
DURATION_RE = Regexp.union(
|
311
362
|
DURATION_I_RE,
|
312
|
-
DURATION_II_RE
|
363
|
+
DURATION_II_RE,
|
313
364
|
)
|
314
365
|
|
315
366
|
|
@@ -26,14 +26,29 @@ GOAL_OG_RE = %r{
|
|
26
26
|
}ix
|
27
27
|
|
28
28
|
|
29
|
+
## minute variant for N/A not/available
|
30
|
+
## todo/check - find a better syntax - why? why not?
|
31
|
+
##
|
32
|
+
## note "??".to_i(10) returns 0 or
|
33
|
+
## "__".to_i(10) returns 0
|
34
|
+
## quick hack - assume 0 for n/a for now
|
35
|
+
|
36
|
+
MINUTE_NA_RE = %r{
|
37
|
+
(?<minute>
|
38
|
+
(?<=[ (]) # positive lookbehind for space or opening
|
39
|
+
(?<value> \?{2} | _{2} )
|
40
|
+
' ## must have minute marker!!!!
|
41
|
+
)
|
42
|
+
}ix
|
43
|
+
|
29
44
|
MINUTE_RE = %r{
|
30
45
|
(?<minute>
|
31
46
|
(?<=[ (]) # positive lookbehind for space or opening ( e.g. (61') required
|
32
47
|
# todo - add more lookbehinds e.g. ,) etc. - why? why not?
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
48
|
+
(?<value>\d{1,3}) ## constrain numbers to 0 to 999!!!
|
49
|
+
(?: \+
|
50
|
+
(?<value2>\d{1,3})
|
51
|
+
)?
|
37
52
|
' ## must have minute marker!!!!
|
38
53
|
)
|
39
54
|
}ix
|
@@ -17,7 +17,7 @@ class Lexer
|
|
17
17
|
## 3-4 pen. 2-2 a.e.t.
|
18
18
|
## 2-2 a.e.t.
|
19
19
|
SCORE__P_ET__RE = %r{
|
20
|
-
(?<
|
20
|
+
(?<score_more>
|
21
21
|
\b
|
22
22
|
(?:
|
23
23
|
(?<p1>\d{1,2}) - (?<p2>\d{1,2})
|
@@ -34,7 +34,7 @@ class Lexer
|
|
34
34
|
## note: allow SPECIAL with penalty only
|
35
35
|
## 3-4 pen.
|
36
36
|
SCORE__P__RE = %r{
|
37
|
-
(?<
|
37
|
+
(?<score_more>
|
38
38
|
\b
|
39
39
|
(?<p1>\d{1,2}) - (?<p2>\d{1,2})
|
40
40
|
[ ]* #{P_EN}
|
@@ -52,7 +52,7 @@ class Lexer
|
|
52
52
|
## 2-2 a.e.t. (1-1)
|
53
53
|
|
54
54
|
SCORE__P_ET_FT_HT__RE = %r{
|
55
|
-
(?<
|
55
|
+
(?<score_more>
|
56
56
|
\b
|
57
57
|
(?:
|
58
58
|
(?<p1>\d{1,2}) - (?<p2>\d{1,2})
|
@@ -79,7 +79,7 @@ class Lexer
|
|
79
79
|
## special case for case WITHOUT extra time!!
|
80
80
|
## same as above (but WITHOUT extra time and pen required)
|
81
81
|
SCORE__P_FT_HT__RE = %r{
|
82
|
-
(?<
|
82
|
+
(?<score_more>
|
83
83
|
\b
|
84
84
|
(?<p1>\d{1,2}) - (?<p2>\d{1,2})
|
85
85
|
[ ]* #{P_EN} [ ]+
|
@@ -99,36 +99,47 @@ class Lexer
|
|
99
99
|
## note: \b works only after non-alphanum e.g. )
|
100
100
|
|
101
101
|
|
102
|
-
|
103
|
-
## e.g. 2-1 (1-1)
|
104
|
-
## 2-1
|
105
|
-
|
102
|
+
##########
|
103
|
+
## e.g. 2-1 (1-1)
|
106
104
|
SCORE__FT_HT__RE = %r{
|
107
|
-
(?<
|
105
|
+
(?<score_more>
|
108
106
|
\b
|
109
107
|
(?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
|
110
|
-
(?:
|
111
108
|
[ ]+ \( [ ]*
|
112
109
|
(?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
|
113
110
|
[ ]* \)
|
114
|
-
)? # note: make half time (HT) score optional for now
|
115
111
|
(?=[ ,\]]|$)
|
116
112
|
)}ix ## todo/check: remove lookahead assertion here - why require space?
|
117
113
|
## note: \b works only after non-alphanum e.g. )
|
118
114
|
|
119
|
-
|
115
|
+
#####
|
116
|
+
## 2-1
|
117
|
+
SCORE__FT__RE = %r{
|
118
|
+
(?<score>
|
119
|
+
\b
|
120
|
+
(?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
|
121
|
+
\b
|
122
|
+
)}ix
|
120
123
|
|
121
124
|
#############################################
|
122
125
|
# map tables
|
123
126
|
# note: order matters; first come-first matched/served
|
127
|
+
#
|
128
|
+
## check - find a better name for SCORE_MORE - SCORE_EX, SCORE_BIG, or ___ - why? why not?
|
124
129
|
|
125
|
-
|
130
|
+
SCORE_MORE_RE = Regexp.union(
|
126
131
|
SCORE__P_ET_FT_HT__RE, # e.g. 5-1 pen. 2-2 a.e.t. (1-1, 1-0)
|
127
132
|
SCORE__P_FT_HT__RE, # e.g. 5-1 pen. (1-1)
|
128
133
|
SCORE__P_ET__RE, # e.g. 2-2 a.e.t. or 5-1 pen. 2-2 a.e.t.
|
129
134
|
SCORE__P__RE, # e.g. 5-1 pen.
|
130
|
-
SCORE__FT_HT__RE, # e.g. 1-1 (1-0)
|
135
|
+
SCORE__FT_HT__RE, # e.g. 1-1 (1-0)
|
136
|
+
## note - keep basic score as its own token!!!!
|
137
|
+
## that is, SCORE & SCORE_MORE
|
138
|
+
### SCORE__FT__RE, # e.g. 1-1 -- note - must go last!!!
|
131
139
|
)
|
132
140
|
|
141
|
+
SCORE_RE = SCORE__FT__RE
|
142
|
+
|
143
|
+
|
133
144
|
end # class Lexer
|
134
145
|
end # module SportDb
|
@@ -12,10 +12,33 @@ STATUS_RE = %r{
|
|
12
12
|
(?:
|
13
13
|
### opt 1 - allow long forms with note/comment for some stati
|
14
14
|
(?: (?<status> awarded
|
15
|
+
## e.g. [awarded match to Leones Negros by undue alignment; original result 1-2]
|
16
|
+
## [awarded 3-0 to Cafetaleros by undue alignment; originally ended 2-0]
|
17
|
+
## [awarded 3-0; originally 0-2, América used ineligible player (Federico Viñas)]
|
15
18
|
|
|
16
19
|
annulled
|
17
20
|
|
|
18
21
|
abandoned
|
22
|
+
## e.g. [abandoned at 1-1 in 65' due to cardiac arrest Luton player Tom Lockyer]
|
23
|
+
## [abandoned at 0-0 in 6' due to waterlogged pitch]
|
24
|
+
## [abandoned at 5-0 in 80' due to attack on assistant referee by Cerro; result stood]
|
25
|
+
## [abandoned at 1-0 in 31']
|
26
|
+
## [abandoned at 0-1' in 85 due to crowd trouble]
|
27
|
+
|
|
28
|
+
postponed
|
29
|
+
## e.g. [postponed due to problems with the screen of the stadium]
|
30
|
+
## [postponed by storm]
|
31
|
+
## [postponed due to tropical storm "Hanna"]
|
32
|
+
## [postponed from Sep 10-12 due to death Queen Elizabeth II]
|
33
|
+
|
|
34
|
+
suspended
|
35
|
+
## e.g. [suspended at 0-0 in 12' due to storm]
|
36
|
+
## [suspended at 84' by storm; result stood]
|
37
|
+
|
|
38
|
+
verified
|
39
|
+
## e.g. [verified 2:0 wo.]
|
40
|
+
|
41
|
+
|
19
42
|
) [ ;,]* (?<status_note> [^\]]+ )
|
20
43
|
[ ]*
|
21
44
|
)
|
@@ -34,12 +57,98 @@ STATUS_RE = %r{
|
|
34
57
|
replay
|
35
58
|
|
|
36
59
|
annulled
|
60
|
+
|
|
61
|
+
suspended ### todo/fix - add status upstream - why? why not?
|
62
|
+
### move to note(s) - do NOT interpret as status - why? why not?
|
63
|
+
|
|
64
|
+
verified ### todo/fix - add status upstream (same as ??) - why? why not?
|
65
|
+
### move to note(s) - do NOT interpret as status - why? why not?
|
37
66
|
)
|
38
67
|
)
|
39
68
|
\]
|
40
69
|
}ix
|
41
70
|
|
42
71
|
|
72
|
+
|
73
|
+
|
74
|
+
###
|
75
|
+
## todo/fix - move to token-note.rb (standalone) file
|
76
|
+
|
77
|
+
NOTE_RE = %r{
|
78
|
+
\[
|
79
|
+
(?<note>
|
80
|
+
(?: ## starting with ___ PLUS requiring more text
|
81
|
+
(?:
|
82
|
+
nb:
|
83
|
+
## e.g. [NB: between top-8 of regular season]
|
84
|
+
# [NB: América, Morelia and Tigres qualified on better record regular season]
|
85
|
+
# [NB: Celaya qualified on away goals]
|
86
|
+
# [NB: Alebrijes qualified on away goal]
|
87
|
+
# [NB: Leones Negros qualified on away goals]
|
88
|
+
#
|
89
|
+
# todo/fix:
|
90
|
+
# add "top-level" NB: version
|
91
|
+
## with full (end-of) line note - why? why not?
|
92
|
+
|
|
93
|
+
(?: originally[ ])? scheduled
|
94
|
+
## e.g. [originally scheduled to play in Mexico City]
|
95
|
+
|
|
96
|
+
rescheduled
|
97
|
+
## e.g. [Rescheduled due to earthquake occurred in Mexico on September 19]
|
98
|
+
|
|
99
|
+
remaining
|
100
|
+
## e.g. [remaining 79']
|
101
|
+
## [remaining 84']
|
102
|
+
## [remaining 59']
|
103
|
+
## [remaining 5']
|
104
|
+
|
|
105
|
+
played
|
106
|
+
## e.g. [played in Macaé-RJ]
|
107
|
+
## [played in Caxias do Sul-RS]
|
108
|
+
## [played in Sete Lagoas-MG]
|
109
|
+
## [played in Uberlândia-MG]
|
110
|
+
## [played in Brasília-DF]
|
111
|
+
## [played in Vöcklabruck]
|
112
|
+
## [played in Pasching]
|
113
|
+
|
|
114
|
+
declared
|
115
|
+
## e.g. [declared void]
|
116
|
+
|
|
117
|
+
inter-group
|
118
|
+
## e.g. [inter-group A-B]
|
119
|
+
## [inter-group C-D]
|
120
|
+
)
|
121
|
+
[ ]
|
122
|
+
[^\]]+? ## slurp all to next ] - (use non-greedy)
|
123
|
+
)
|
124
|
+
|
|
125
|
+
(?:
|
126
|
+
## starting with in - do NOT allow digits
|
127
|
+
## name starting with in possible - why? why not?
|
128
|
+
in[ ]
|
129
|
+
[^0-9\]]+?
|
130
|
+
## e.g. [In Estadio La Corregidora]
|
131
|
+
## [in Unidad Deportiva Centenario]
|
132
|
+
## [in Estadio Olímpico Universitario]
|
133
|
+
## [in Estadio Victoria]
|
134
|
+
## [in UD José Brindis]
|
135
|
+
## [in Colomos Alfredo "Pistache" Torres stadium]
|
136
|
+
)
|
137
|
+
|
|
138
|
+
(?:
|
139
|
+
## e.g. Spain wins on penalties
|
140
|
+
## 1860 München wins on penalties etc.
|
141
|
+
## must start with digit 1-9 or letter
|
142
|
+
## todo - add more special chars - why? why not?
|
143
|
+
[1-9\p{L}][0-9\p{L} .-]+?
|
144
|
+
[ ]wins[ ]on[ ]penalties
|
145
|
+
[^\]]*? ## use non-greedy
|
146
|
+
)
|
147
|
+
)
|
148
|
+
\]
|
149
|
+
}ix
|
150
|
+
|
151
|
+
|
43
152
|
end # class Lexer
|
44
153
|
end # module SportDb
|
45
154
|
|
data/lib/sportdb/parser/token.rb
CHANGED
@@ -111,7 +111,15 @@ BASICS_RE = %r{
|
|
111
111
|
(?<spaces> [ ]{2,}) |
|
112
112
|
(?<space> [ ])
|
113
113
|
|
|
114
|
-
(?<sym>[
|
114
|
+
(?<sym> (?<=^|[ ]) ## positive lookbehind
|
115
|
+
(?: ----|
|
116
|
+
---|
|
117
|
+
--
|
118
|
+
)
|
119
|
+
(?=[ ]) ## positive lookahead
|
120
|
+
)
|
121
|
+
|
|
122
|
+
(?<sym> [;,/@|\[\]-] )
|
115
123
|
}ix
|
116
124
|
|
117
125
|
|
@@ -119,14 +127,17 @@ BASICS_RE = %r{
|
|
119
127
|
|
120
128
|
RE = Regexp.union( PROP_KEY_RE, ## start with prop key (match will/should switch into prop mode!!!)
|
121
129
|
STATUS_RE,
|
130
|
+
NOTE_RE,
|
122
131
|
TIMEZONE_RE,
|
123
132
|
TIME_RE,
|
124
133
|
DURATION_RE, # note - duration MUST match before date
|
125
134
|
DATE_RE,
|
126
135
|
WDAY_RE, # allow standalone weekday name (e.g. Mo/Tu/etc.) - why? why not?
|
127
|
-
|
136
|
+
SCORE_MORE_RE,
|
137
|
+
SCORE_RE, ## note basic score e.g. 1-1 must go after SCORE_MORE_RE!!!
|
128
138
|
BASICS_RE,
|
129
139
|
MINUTE_RE,
|
140
|
+
MINUTE_NA_RE, ## note - add/allow not/available (n/a,na) minutes hack for now
|
130
141
|
GOAL_OG_RE, GOAL_PEN_RE,
|
131
142
|
TEXT_RE )
|
132
143
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sportdb-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.0
|
4
|
+
version: 0.6.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-
|
11
|
+
date: 2025-02-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: cocos
|