sportdb-parser 0.6.0 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -68,12 +68,14 @@ def initialize( txt, debug: false )
68
68
 
69
69
 
70
70
  def on_error(error_token_id, error_value, value_stack)
71
- args = [error_token_id, error_value, value_stack]
71
+ ## auto-add error_token (as string)
72
+ error_token = Racc_token_to_s_table[error_token_id]
73
+ args = [error_token, error_token_id, error_value, value_stack]
72
74
  puts
73
75
  puts "!! on parse error:"
74
76
  puts "args=#{args.pretty_inspect}"
75
77
 
76
- @errors << "parse error on token: #{error_token_id} with value: #{error_value}, stack: #{value_stack.pretty_inspect}"
78
+ @errors << "parse error on token: #{error_token} (#{error_token_id}) with value: #{error_value}, stack: #{value_stack.pretty_inspect}"
77
79
  ## exit 1 ## exit for now - get and print more info about context etc.!!
78
80
  end
79
81
 
@@ -147,15 +147,15 @@ DATE_II_RE = %r{
147
147
 
148
148
 
149
149
  # e.g. iso-date - 2011-08-25
150
- ## todo/check - allow 2011-8-25 or 2011-8-3 / 2011-08-03 etc. - why? why not?
150
+ ## note - allow/support ("shortcuts") e.g. 2011-8-25 or 2011-8-3 / 2011-08-03 etc.
151
151
  DATE_III_RE = %r{
152
152
  (?<date>
153
153
  \b
154
154
  (?<year>\d{4})
155
155
  -
156
- (?<month>\d{2})
156
+ (?<month>\d{1,2})
157
157
  -
158
- (?<day>\d{2})
158
+ (?<day>\d{1,2})
159
159
  \b
160
160
  )}ix
161
161
 
@@ -214,29 +214,36 @@ end
214
214
  #
215
215
  # Sun Jun/23 - Wed Jun/26 -- YES
216
216
  # Jun/23 - Jun/26 -- YES
217
- # Tue Jun/25 + Wed Jun/26 -- YES
218
- # Jun/25 + Jun/26 -- YES
219
- #
220
- # Jun/25 - 26 - why? why not???
217
+ # Jun/25 - 26 - why? why not??? - YES - see below variant iii!!!
218
+
219
+ # Tue Jun/25 + Wed Jun/26 -- NO
220
+ # Jun/25 + Jun/26 -- NO
221
221
  # Jun/25 .. 26 - why? why not???
222
222
  # Jun/25 to 26 - why? why not???
223
223
  # Jun/25 + 26 - add - why? why not???
224
224
  # Sun-Wed Jun/23-26 - add - why? why not???
225
225
  # Wed+Thu Jun/26+27 2024 - add - why? why not???
226
226
  #
227
- # maybe use comman and plus for list of dates
227
+ # maybe use comma and plus for list of dates
228
228
  # Tue Jun/25, Wed Jun/26, Thu Jun/27 ??
229
229
  # Tue Jun/25 + Wed Jun/26 + Thu Jun/27 ??
230
230
  #
231
231
  # add back optional comma (before) year - why? why not?
232
+ #
232
233
 
233
234
 
234
235
  ##
235
236
  # todo add plus later on - why? why not?
237
+ ### todo/fix add optional comma (,) before year
238
+
239
+ ### regex note/tip/reminder - \b () \b MUST always get enclosed in parentheses
240
+ ## because alternation (|) has lowest priority/binding
241
+
236
242
 
237
243
  DURATION_I_RE = %r{
238
244
  (?<duration>
239
245
  \b
246
+ (?:
240
247
  ## optional day name
241
248
  ((?<day_name1>#{DAY_NAMES})
242
249
  [ ]
@@ -245,12 +252,13 @@ DURATION_I_RE = %r{
245
252
  (?: \/|[ ] )
246
253
  (?<day1>\d{1,2})
247
254
  ## optional year
248
- ( [ ]
255
+ ( ,? # optional comma
256
+ [ ]
249
257
  (?<year1>\d{4})
250
258
  )?
251
259
 
252
260
  ## support + and - (add .. or such - why??)
253
- [ ]*[-][ ]*
261
+ [ ]* - [ ]*
254
262
 
255
263
  ## optional day name
256
264
  ((?<day_name2>#{DAY_NAMES})
@@ -260,20 +268,28 @@ DURATION_I_RE = %r{
260
268
  (?: \/|[ ] )
261
269
  (?<day2>\d{1,2})
262
270
  ## optional year
263
- ( [ ]
271
+ ( ,? # optional comma
272
+ [ ]
264
273
  (?<year2>\d{4})
265
274
  )?
275
+ )
266
276
  \b
267
277
  )}ix
268
278
 
269
279
 
280
+
281
+ # FIX - remove this variant
282
+ # "standardize on month day [year]" !!!!
283
+
284
+ =begin
270
285
  ###
271
286
  # variant ii
272
287
  # e.g. 26 July - 27 July
273
-
274
- DURATION_II_RE = %r{
288
+ # 26 July,
289
+ XXX_DURATION_II_RE = %r{
275
290
  (?<duration>
276
291
  \b
292
+ (?
277
293
  ## optional day name
278
294
  ((?<day_name1>#{DAY_NAMES})
279
295
  [ ]
@@ -282,7 +298,8 @@ DURATION_II_RE = %r{
282
298
  [ ]
283
299
  (?<month_name1>#{MONTH_NAMES})
284
300
  ## optional year
285
- ( [ ]
301
+ (
302
+ [ ]
286
303
  (?<year1>\d{4})
287
304
  )?
288
305
 
@@ -300,16 +317,50 @@ DURATION_II_RE = %r{
300
317
  ( [ ]
301
318
  (?<year2>\d{4})
302
319
  )?
320
+ )
321
+ \b
322
+ )}ix
323
+ =end
324
+
325
+
326
+ # variant ii
327
+ # add support for shorthand
328
+ # August 16-18, 2011
329
+ # September 13-15, 2011
330
+ # October 18-20, 2011
331
+ # March/6-8, 2012
332
+ # March 6-8 2012
333
+ # March 6-8
334
+ #
335
+ # - add support for August 16+17 or such (and check 16+18)
336
+ # use <op> to check if day2 is a plus or range or such - why? why not?
337
+
338
+ DURATION_II_RE = %r{
339
+ (?<duration>
340
+ \b
341
+ (?:
342
+ (?<month_name1>#{MONTH_NAMES})
343
+ [ /]
344
+ (?<day1>\d{1,2})
345
+ -
346
+ (?<day2>\d{1,2})
347
+ (?:
348
+ ,? ## optional comma
349
+ [ ]
350
+ (?<year1>\d{4})
351
+ )? ## optional year
352
+ )
303
353
  \b
304
354
  )}ix
305
355
 
306
356
 
357
+
307
358
  #############################################
308
359
  # map tables
309
360
  # note: order matters; first come-first matched/served
310
361
  DURATION_RE = Regexp.union(
311
362
  DURATION_I_RE,
312
- DURATION_II_RE
363
+ DURATION_II_RE,
313
364
  )
314
365
 
315
366
 
@@ -26,14 +26,29 @@ GOAL_OG_RE = %r{
26
26
  }ix
27
27
 
28
28
 
29
+ ## minute variant for N/A not/available
30
+ ## todo/check - find a better syntax - why? why not?
31
+ ##
32
+ ## note "??".to_i(10) returns 0 or
33
+ ## "__".to_i(10) returns 0
34
+ ## quick hack - assume 0 for n/a for now
35
+
36
+ MINUTE_NA_RE = %r{
37
+ (?<minute>
38
+ (?<=[ (]) # positive lookbehind for space or opening
39
+ (?<value> \?{2} | _{2} )
40
+ ' ## must have minute marker!!!!
41
+ )
42
+ }ix
43
+
29
44
  MINUTE_RE = %r{
30
45
  (?<minute>
31
46
  (?<=[ (]) # positive lookbehind for space or opening ( e.g. (61') required
32
47
  # todo - add more lookbehinds e.g. ,) etc. - why? why not?
33
- (?<value>\d{1,3}) ## constrain numbers to 0 to 999!!!
34
- (?: \+
35
- (?<value2>\d{1,3})
36
- )?
48
+ (?<value>\d{1,3}) ## constrain numbers to 0 to 999!!!
49
+ (?: \+
50
+ (?<value2>\d{1,3})
51
+ )?
37
52
  ' ## must have minute marker!!!!
38
53
  )
39
54
  }ix
@@ -17,7 +17,7 @@ class Lexer
17
17
  ## 3-4 pen. 2-2 a.e.t.
18
18
  ## 2-2 a.e.t.
19
19
  SCORE__P_ET__RE = %r{
20
- (?<score>
20
+ (?<score_more>
21
21
  \b
22
22
  (?:
23
23
  (?<p1>\d{1,2}) - (?<p2>\d{1,2})
@@ -34,7 +34,7 @@ class Lexer
34
34
  ## note: allow SPECIAL with penalty only
35
35
  ## 3-4 pen.
36
36
  SCORE__P__RE = %r{
37
- (?<score>
37
+ (?<score_more>
38
38
  \b
39
39
  (?<p1>\d{1,2}) - (?<p2>\d{1,2})
40
40
  [ ]* #{P_EN}
@@ -52,7 +52,7 @@ class Lexer
52
52
  ## 2-2 a.e.t. (1-1)
53
53
 
54
54
  SCORE__P_ET_FT_HT__RE = %r{
55
- (?<score>
55
+ (?<score_more>
56
56
  \b
57
57
  (?:
58
58
  (?<p1>\d{1,2}) - (?<p2>\d{1,2})
@@ -79,7 +79,7 @@ class Lexer
79
79
  ## special case for case WITHOUT extra time!!
80
80
  ## same as above (but WITHOUT extra time and pen required)
81
81
  SCORE__P_FT_HT__RE = %r{
82
- (?<score>
82
+ (?<score_more>
83
83
  \b
84
84
  (?<p1>\d{1,2}) - (?<p2>\d{1,2})
85
85
  [ ]* #{P_EN} [ ]+
@@ -99,36 +99,47 @@ class Lexer
99
99
  ## note: \b works only after non-alphanum e.g. )
100
100
 
101
101
 
102
-
103
- ## e.g. 2-1 (1-1) or
104
- ## 2-1
105
-
102
+ ##########
103
+ ## e.g. 2-1 (1-1)
106
104
  SCORE__FT_HT__RE = %r{
107
- (?<score>
105
+ (?<score_more>
108
106
  \b
109
107
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
110
- (?:
111
108
  [ ]+ \( [ ]*
112
109
  (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
113
110
  [ ]* \)
114
- )? # note: make half time (HT) score optional for now
115
111
  (?=[ ,\]]|$)
116
112
  )}ix ## todo/check: remove lookahead assertion here - why require space?
117
113
  ## note: \b works only after non-alphanum e.g. )
118
114
 
119
-
115
+ #####
116
+ ## 2-1
117
+ SCORE__FT__RE = %r{
118
+ (?<score>
119
+ \b
120
+ (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
121
+ \b
122
+ )}ix
120
123
 
121
124
  #############################################
122
125
  # map tables
123
126
  # note: order matters; first come-first matched/served
127
+ #
128
+ ## check - find a better name for SCORE_MORE - SCORE_EX, SCORE_BIG, or ___ - why? why not?
124
129
 
125
- SCORE_RE = Regexp.union(
130
+ SCORE_MORE_RE = Regexp.union(
126
131
  SCORE__P_ET_FT_HT__RE, # e.g. 5-1 pen. 2-2 a.e.t. (1-1, 1-0)
127
132
  SCORE__P_FT_HT__RE, # e.g. 5-1 pen. (1-1)
128
133
  SCORE__P_ET__RE, # e.g. 2-2 a.e.t. or 5-1 pen. 2-2 a.e.t.
129
134
  SCORE__P__RE, # e.g. 5-1 pen.
130
- SCORE__FT_HT__RE, # e.g. 1-1 (1-0) or 1-1 -- note - must go last!!!
135
+ SCORE__FT_HT__RE, # e.g. 1-1 (1-0)
136
+ ## note - keep basic score as its own token!!!!
137
+ ## that is, SCORE & SCORE_MORE
138
+ ### SCORE__FT__RE, # e.g. 1-1 -- note - must go last!!!
131
139
  )
132
140
 
141
+ SCORE_RE = SCORE__FT__RE
142
+
143
+
133
144
  end # class Lexer
134
145
  end # module SportDb
@@ -12,10 +12,33 @@ STATUS_RE = %r{
12
12
  (?:
13
13
  ### opt 1 - allow long forms with note/comment for some stati
14
14
  (?: (?<status> awarded
15
+ ## e.g. [awarded match to Leones Negros by undue alignment; original result 1-2]
16
+ ## [awarded 3-0 to Cafetaleros by undue alignment; originally ended 2-0]
17
+ ## [awarded 3-0; originally 0-2, América used ineligible player (Federico Viñas)]
15
18
  |
16
19
  annulled
17
20
  |
18
21
  abandoned
22
+ ## e.g. [abandoned at 1-1 in 65' due to cardiac arrest Luton player Tom Lockyer]
23
+ ## [abandoned at 0-0 in 6' due to waterlogged pitch]
24
+ ## [abandoned at 5-0 in 80' due to attack on assistant referee by Cerro; result stood]
25
+ ## [abandoned at 1-0 in 31']
26
+ ## [abandoned at 0-1' in 85 due to crowd trouble]
27
+ |
28
+ postponed
29
+ ## e.g. [postponed due to problems with the screen of the stadium]
30
+ ## [postponed by storm]
31
+ ## [postponed due to tropical storm "Hanna"]
32
+ ## [postponed from Sep 10-12 due to death Queen Elizabeth II]
33
+ |
34
+ suspended
35
+ ## e.g. [suspended at 0-0 in 12' due to storm]
36
+ ## [suspended at 84' by storm; result stood]
37
+ |
38
+ verified
39
+ ## e.g. [verified 2:0 wo.]
40
+
41
+
19
42
  ) [ ;,]* (?<status_note> [^\]]+ )
20
43
  [ ]*
21
44
  )
@@ -34,12 +57,98 @@ STATUS_RE = %r{
34
57
  replay
35
58
  |
36
59
  annulled
60
+ |
61
+ suspended ### todo/fix - add status upstream - why? why not?
62
+ ### move to note(s) - do NOT interpret as status - why? why not?
63
+ |
64
+ verified ### todo/fix - add status upstream (same as ??) - why? why not?
65
+ ### move to note(s) - do NOT interpret as status - why? why not?
37
66
  )
38
67
  )
39
68
  \]
40
69
  }ix
41
70
 
42
71
 
72
+
73
+
74
+ ###
75
+ ## todo/fix - move to token-note.rb (standalone) file
76
+
77
+ NOTE_RE = %r{
78
+ \[
79
+ (?<note>
80
+ (?: ## starting with ___ PLUS requiring more text
81
+ (?:
82
+ nb:
83
+ ## e.g. [NB: between top-8 of regular season]
84
+ # [NB: América, Morelia and Tigres qualified on better record regular season]
85
+ # [NB: Celaya qualified on away goals]
86
+ # [NB: Alebrijes qualified on away goal]
87
+ # [NB: Leones Negros qualified on away goals]
88
+ #
89
+ # todo/fix:
90
+ # add "top-level" NB: version
91
+ ## with full (end-of) line note - why? why not?
92
+ |
93
+ (?: originally[ ])? scheduled
94
+ ## e.g. [originally scheduled to play in Mexico City]
95
+ |
96
+ rescheduled
97
+ ## e.g. [Rescheduled due to earthquake occurred in Mexico on September 19]
98
+ |
99
+ remaining
100
+ ## e.g. [remaining 79']
101
+ ## [remaining 84']
102
+ ## [remaining 59']
103
+ ## [remaining 5']
104
+ |
105
+ played
106
+ ## e.g. [played in Macaé-RJ]
107
+ ## [played in Caxias do Sul-RS]
108
+ ## [played in Sete Lagoas-MG]
109
+ ## [played in Uberlândia-MG]
110
+ ## [played in Brasília-DF]
111
+ ## [played in Vöcklabruck]
112
+ ## [played in Pasching]
113
+ |
114
+ declared
115
+ ## e.g. [declared void]
116
+ |
117
+ inter-group
118
+ ## e.g. [inter-group A-B]
119
+ ## [inter-group C-D]
120
+ )
121
+ [ ]
122
+ [^\]]+? ## slurp all to next ] - (use non-greedy)
123
+ )
124
+ |
125
+ (?:
126
+ ## starting with in - do NOT allow digits
127
+ ## name starting with in possible - why? why not?
128
+ in[ ]
129
+ [^0-9\]]+?
130
+ ## e.g. [In Estadio La Corregidora]
131
+ ## [in Unidad Deportiva Centenario]
132
+ ## [in Estadio Olímpico Universitario]
133
+ ## [in Estadio Victoria]
134
+ ## [in UD José Brindis]
135
+ ## [in Colomos Alfredo "Pistache" Torres stadium]
136
+ )
137
+ |
138
+ (?:
139
+ ## e.g. Spain wins on penalties
140
+ ## 1860 München wins on penalties etc.
141
+ ## must start with digit 1-9 or letter
142
+ ## todo - add more special chars - why? why not?
143
+ [1-9\p{L}][0-9\p{L} .-]+?
144
+ [ ]wins[ ]on[ ]penalties
145
+ [^\]]*? ## use non-greedy
146
+ )
147
+ )
148
+ \]
149
+ }ix
150
+
151
+
43
152
  end # class Lexer
44
153
  end # module SportDb
45
154
 
@@ -111,7 +111,15 @@ BASICS_RE = %r{
111
111
  (?<spaces> [ ]{2,}) |
112
112
  (?<space> [ ])
113
113
  |
114
- (?<sym>[;,/@|\[\]-])
114
+ (?<sym> (?<=^|[ ]) ## positive lookbehind
115
+ (?: ----|
116
+ ---|
117
+ --
118
+ )
119
+ (?=[ ]) ## positive lookahead
120
+ )
121
+ |
122
+ (?<sym> [;,/@|\[\]-] )
115
123
  }ix
116
124
 
117
125
 
@@ -119,14 +127,17 @@ BASICS_RE = %r{
119
127
 
120
128
  RE = Regexp.union( PROP_KEY_RE, ## start with prop key (match will/should switch into prop mode!!!)
121
129
  STATUS_RE,
130
+ NOTE_RE,
122
131
  TIMEZONE_RE,
123
132
  TIME_RE,
124
133
  DURATION_RE, # note - duration MUST match before date
125
134
  DATE_RE,
126
135
  WDAY_RE, # allow standalone weekday name (e.g. Mo/Tu/etc.) - why? why not?
127
- SCORE_RE,
136
+ SCORE_MORE_RE,
137
+ SCORE_RE, ## note basic score e.g. 1-1 must go after SCORE_MORE_RE!!!
128
138
  BASICS_RE,
129
139
  MINUTE_RE,
140
+ MINUTE_NA_RE, ## note - add/allow not/available (n/a,na) minutes hack for now
130
141
  GOAL_OG_RE, GOAL_PEN_RE,
131
142
  TEXT_RE )
132
143
 
@@ -4,7 +4,7 @@ module SportDb
4
4
  module Parser
5
5
  MAJOR = 0 ## todo: namespace inside version or something - why? why not??
6
6
  MINOR = 6
7
- PATCH = 0
7
+ PATCH = 2
8
8
  VERSION = [MAJOR,MINOR,PATCH].join('.')
9
9
 
10
10
  def self.version
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sportdb-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.6.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-01-30 00:00:00.000000000 Z
11
+ date: 2025-02-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: cocos