sportdb-parser 0.5.9 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,7 +14,7 @@ def initialize( txt, debug: false )
14
14
 
15
15
  ### todo:
16
16
  ## - pass along debug flag
17
- lexer = SportDb::Lexer.new( txt )
17
+ lexer = SportDb::Lexer.new( txt, debug: debug )
18
18
  ## note - use tokenize_with_errors and add/collect tokenize errors
19
19
  @tokens, @errors = lexer.tokenize_with_errors
20
20
  ## pp @tokens
@@ -68,12 +68,14 @@ def initialize( txt, debug: false )
68
68
 
69
69
 
70
70
  def on_error(error_token_id, error_value, value_stack)
71
- args = [error_token_id, error_value, value_stack]
71
+ ## auto-add error_token (as string)
72
+ error_token = Racc_token_to_s_table[error_token_id]
73
+ args = [error_token, error_token_id, error_value, value_stack]
72
74
  puts
73
75
  puts "!! on parse error:"
74
76
  puts "args=#{args.pretty_inspect}"
75
77
 
76
- @errors << "parse error on token: #{error_token_id} with value: #{error_value}, stack: #{value_stack.pretty_inspect}"
78
+ @errors << "parse error on token: #{error_token} (#{error_token_id}) with value: #{error_value}, stack: #{value_stack.pretty_inspect}"
77
79
  ## exit 1 ## exit for now - get and print more info about context etc.!!
78
80
  end
79
81
 
@@ -71,10 +71,14 @@ RoundDef = Struct.new( :name, :date, :duration ) do
71
71
  end
72
72
  end
73
73
 
74
- DateHeader = Struct.new( :date ) do
74
+ DateHeader = Struct.new( :date, :time, :geo, :timezone ) do
75
75
  def pretty_print( printer )
76
76
  printer.text( "<DateHeader " )
77
- printer.text( "#{self.date.pretty_inspect}>" )
77
+ printer.text( "#{self.date.pretty_inspect}" )
78
+ printer.text( " time=#{self.time.pretty_inspect}" ) if self.time
79
+ printer.text( " geo=#{self.geo.pretty_inspect}" ) if self.geo
80
+ printer.text( " timezone=#{self.timezone}") if self.timezone
81
+ printer.text( ">")
78
82
  end
79
83
  end
80
84
 
@@ -85,14 +89,17 @@ GroupHeader = Struct.new( :name ) do
85
89
  end
86
90
  end
87
91
 
88
- RoundHeader = Struct.new( :names ) do
92
+ RoundHeader = Struct.new( :names, :group ) do
89
93
  def pretty_print( printer )
90
94
  printer.text( "<RoundHeader " )
91
- printer.text( "#{self.names.join(', ')}>" )
95
+ printer.text( "#{self.names.join(', ')}" )
96
+ printer.text( " group=#{self.group}") if self.group
97
+ printer.text( ">" )
92
98
  end
93
99
  end
94
100
 
95
- MatchLine = Struct.new( :ord, :date, :time,
101
+
102
+ MatchLine = Struct.new( :ord, :date, :time, :wday,
96
103
  :team1, :team2, :score,
97
104
  :status,
98
105
  :geo,
@@ -146,12 +146,29 @@ DATE_II_RE = %r{
146
146
  )}ix
147
147
 
148
148
 
149
+ # e.g. iso-date - 2011-08-25
150
+ ## note - allow/support ("shortcuts") e.g 2011-8-25 or 2011-8-3 / 2011-08-03 etc.
151
+ DATE_III_RE = %r{
152
+ (?<date>
153
+ \b
154
+ (?<year>\d{4})
155
+ -
156
+ (?<month>\d{1,2})
157
+ -
158
+ (?<day>\d{1,2})
159
+ \b
160
+ )}ix
161
+
162
+
163
+
164
+
149
165
  #############################################
150
166
  # map tables
151
167
  # note: order matters; first come-first matched/served
152
168
  DATE_RE = Regexp.union(
153
169
  DATE_I_RE,
154
- DATE_II_RE
170
+ DATE_II_RE,
171
+ DATE_III_RE,
155
172
  )
156
173
 
157
174
 
@@ -197,29 +214,36 @@ end
197
214
  #
198
215
  # Sun Jun/23 - Wed Jun/26 -- YES
199
216
  # Jun/23 - Jun/26 -- YES
200
- # Tue Jun/25 + Wed Jun/26 -- YES
201
- # Jun/25 + Jun/26 -- YES
202
- #
203
- # Jun/25 - 26 - why? why not???
217
+ # Jun/25 - 26 - why? why not??? - YES - see below variant iii!!!
218
+
219
+ # Tue Jun/25 + Wed Jun/26 -- NO
220
+ # Jun/25 + Jun/26 -- NO
204
221
  # Jun/25 .. 26 - why? why not???
205
222
  # Jun/25 to 26 - why? why not???
206
223
  # Jun/25 + 26 - add - why? why not???
207
224
  # Sun-Wed Jun/23-26 - add - why? why not???
208
225
  # Wed+Thu Jun/26+27 2024 - add - why? why not???
209
226
  #
210
- # maybe use comman and plus for list of dates
227
+ # maybe use comma and plus for list of dates
211
228
  # Tue Jun/25, Wed Jun/26, Thu Jun/27 ??
212
229
  # Tue Jun/25 + Wed Jun/26 + Thu Jun/27 ??
213
230
  #
214
231
  # add back optional comma (before) year - why? why not?
232
+ #
215
233
 
216
234
 
217
235
  ##
218
236
  # todo add plus later on - why? why not?
237
+ ### todo/fix add optional comma (,) before year
238
+
239
+ ### regex note/tip/reminder - \b () \b MUST always get enclosed in parentheses
240
+ ## because alternation (|) has lowest priority/binding
241
+
219
242
 
220
243
  DURATION_I_RE = %r{
221
244
  (?<duration>
222
245
  \b
246
+ (?:
223
247
  ## optional day name
224
248
  ((?<day_name1>#{DAY_NAMES})
225
249
  [ ]
@@ -228,12 +252,13 @@ DURATION_I_RE = %r{
228
252
  (?: \/|[ ] )
229
253
  (?<day1>\d{1,2})
230
254
  ## optional year
231
- ( [ ]
255
+ ( ,? # optional comma
256
+ [ ]
232
257
  (?<year1>\d{4})
233
258
  )?
234
259
 
235
260
  ## support + and - (add .. or such - why??)
236
- [ ]*[-][ ]*
261
+ [ ]* - [ ]*
237
262
 
238
263
  ## optional day name
239
264
  ((?<day_name2>#{DAY_NAMES})
@@ -243,20 +268,28 @@ DURATION_I_RE = %r{
243
268
  (?: \/|[ ] )
244
269
  (?<day2>\d{1,2})
245
270
  ## optional year
246
- ( [ ]
271
+ ( ,? # optional comma
272
+ [ ]
247
273
  (?<year2>\d{4})
248
274
  )?
275
+ )
249
276
  \b
250
277
  )}ix
251
278
 
252
279
 
280
+
281
+ # FIX - remove this variant
282
+ # "standardize on month day [year]" !!!!
283
+
284
+ =begin
253
285
  ###
254
286
  # variant ii
255
287
  # e.g. 26 July - 27 July
256
-
257
- DURATION_II_RE = %r{
288
+ # 26 July,
289
+ XXX_DURATION_II_RE = %r{
258
290
  (?<duration>
259
291
  \b
292
+ (?
260
293
  ## optional day name
261
294
  ((?<day_name1>#{DAY_NAMES})
262
295
  [ ]
@@ -265,7 +298,8 @@ DURATION_II_RE = %r{
265
298
  [ ]
266
299
  (?<month_name1>#{MONTH_NAMES})
267
300
  ## optional year
268
- ( [ ]
301
+ (
302
+ [ ]
269
303
  (?<year1>\d{4})
270
304
  )?
271
305
 
@@ -283,16 +317,50 @@ DURATION_II_RE = %r{
283
317
  ( [ ]
284
318
  (?<year2>\d{4})
285
319
  )?
320
+ )
321
+ \b
322
+ )}ix
323
+ =end
324
+
325
+
326
+ # variant ii
327
+ # add support for shorthand
328
+ # August 16-18, 2011
329
+ # September 13-15, 2011
330
+ # October 18-20, 2011
331
+ # March/6-8, 2012
332
+ # March 6-8 2012
333
+ # March 6-8
334
+ #
335
+ # - add support for August 16+17 or such (and check 16+18)
336
+ # use <op> to check if day2 is a plus or range or such - why? why not?
337
+
338
+ DURATION_II_RE = %r{
339
+ (?<duration>
340
+ \b
341
+ (?:
342
+ (?<month_name1>#{MONTH_NAMES})
343
+ [ /]
344
+ (?<day1>\d{1,2})
345
+ -
346
+ (?<day2>\d{1,2})
347
+ (?:
348
+ ,? ## optional comma
349
+ [ ]
350
+ (?<year1>\d{4})
351
+ )? ## optional year
352
+ )
286
353
  \b
287
354
  )}ix
288
355
 
289
356
 
357
+
290
358
  #############################################
291
359
  # map tables
292
360
  # note: order matters; first come-first matched/served
293
361
  DURATION_RE = Regexp.union(
294
362
  DURATION_I_RE,
295
- DURATION_II_RE
363
+ DURATION_II_RE,
296
364
  )
297
365
 
298
366
 
@@ -0,0 +1,45 @@
1
+
2
+ module SportDb
3
+ class Lexer
4
+
5
+ #
6
+ # todo/check - move goal type regexes to goal or somewhere else?
7
+ #
8
+
9
+ ## goal types
10
+ # (pen.) or (pen) or (p.) or (p)
11
+ ## (o.g.) or (og)
12
+ ## todo/check - keep case-insensitive
13
+ ## or allow OG or P or PEN or
14
+ ## only lower case - why? why not?
15
+ GOAL_PEN_RE = %r{
16
+ (?<pen> \(
17
+ (?:pen|p)\.?
18
+ \)
19
+ )
20
+ }ix
21
+ GOAL_OG_RE = %r{
22
+ (?<og> \(
23
+ (?:og|o\.g\.)
24
+ \)
25
+ )
26
+ }ix
27
+
28
+
29
+ MINUTE_RE = %r{
30
+ (?<minute>
31
+ (?<=[ (]) # positive lookbehind for space or opening ( e.g. (61') required
32
+ # todo - add more lookbehinds e.g. ,) etc. - why? why not?
33
+ (?<value>\d{1,3}) ## constrain numbers to 0 to 999!!!
34
+ (?: \+
35
+ (?<value2>\d{1,3})
36
+ )?
37
+ ' ## must have minute marker!!!!
38
+ )
39
+ }ix
40
+
41
+
42
+
43
+
44
+ end # class Lexer
45
+ end # module SportDb
@@ -0,0 +1,133 @@
1
+ ###
2
+ ## team prop mode e.g.
3
+ ##
4
+ ##
5
+ ## Fri Jun 14 21:00 @ München Fußball Arena, München
6
+ ## (1) Germany v Scotland 5-1 (3-0)
7
+ ## Wirtz 10' Musiala 19' Havertz 45+1' (pen.) Füllkrug 68' Can 90+3'; Rüdiger 87' (o.g.)
8
+ ##
9
+ ## Germany: Neuer - Kimmich, Rüdiger, Tah [Y], Mittelstädt - Andrich [Y] (Groß 46'),
10
+ ## Kroos (Can 80') - Musiala (Müller 74'), Gündogan, Wirtz (Sane 63') -
11
+ ## Havertz (Füllkrug 63')
12
+ ## Scotland: Gunn - Porteous [R 44'], Hendry, Tierney (McKenna 78') - Ralston [Y],
13
+ ## McTominay, McGregor (Gilmour 67'), Robertson - Christie (Shankland 82'),
14
+ ## Adams (Hanley 46'), McGinn (McLean 67')
15
+
16
+
17
+ module SportDb
18
+ class Lexer
19
+
20
+
21
+ ## name different from text (does NOT allow number in name/text)
22
+
23
+ PROP_NAME_RE = %r{
24
+ (?<prop_name> \b
25
+ (?<name>
26
+ \p{L}+
27
+ \.? ## optional dot
28
+ (?:
29
+ [ ]? # only single spaces allowed inline!!!
30
+ (?:
31
+ (?:
32
+ (?<=\p{L}) ## use lookbehind
33
+ [/'-] ## must be surrounded by letters
34
+ ## e.g. One/Two NOT
35
+ ## One/ Two or One / Two or One /Two etc.
36
+ (?=\p{L}) ## use lookahead
37
+ )
38
+ |
39
+ (?:
40
+ (?<=[ ]) ## use lookbehind -- add letter (plus dot) or such - why? why not?
41
+ ['] ## must be surrounded by leading space and
42
+ ## trailing letters (e.g. UDI 'Beter Bed)
43
+ (?=\p{L}) ## use lookahead
44
+ )
45
+ |
46
+ (?:
47
+ (?<=\p{L}) ## use lookbehind
48
+ ['] ## must be surrounded by leading letter and
49
+ ## trailing space PLUS letter (e.g. UDI' Beter Bed)
50
+ (?=[ ]\p{L}) ## use lookahead (space WITH letter
51
+ )
52
+ | ## standard case with letter(s) and optional dot
53
+ (?: \p{L}+
54
+ \.? ## optional dot
55
+ )
56
+ )+
57
+ )*
58
+ )
59
+ ## add lookahead - must be non-alphanum
60
+ (?=[ ,;\]\)]|$)
61
+ )
62
+ }ix
63
+
64
+
65
+
66
+
67
+ ##############
68
+ # add support for props/ attributes e.g.
69
+ #
70
+ # Germany: Neuer - Kimmich, Rüdiger, Tah [Y], Mittelstädt - Andrich [Y] (46' Groß),
71
+ # Kroos (80' Can) - Musiala (74' Müller), Gündogan,
72
+ # Wirtz (63' Sane) - Havertz (63' Füllkrug)
73
+ # Scotland: Gunn - Porteous [R 44'], Hendry, Tierney (78' McKenna) - Ralston [Y],
74
+ # McTominay, McGregor (67' Gilmour), Robertson - Christie (82' Shankland),
75
+ # Adams (46' Hanley), McGinn (67' McLean)
76
+ #
77
+ ## note: colon (:) MUST be followed by one (or more) spaces
78
+ ## make sure mon feb 12 18:10 will not match
79
+ ## allow 1. FC Köln etc.
80
+ ## Mainz 05:
81
+ ## limit to 30 chars max
82
+ ## only allow chars incl. intl but (NOT ()[]/;)
83
+ ##
84
+ ## todo/fix:
85
+ ## check if St. Pölten works; with starting St. ???
86
+
87
+
88
+ PROP_KEY_RE = %r{
89
+ (?<prop_key> \b
90
+ (?<key>
91
+ (?:\p{L}+
92
+ |
93
+ \d+ # check for num lookahead (MUST be space or dot)
94
+ ## MUST be followed by (optional dot) and
95
+ ## required space !!!
96
+ ## MUST be followed by a to z!!!!
97
+ \.? ## optional dot
98
+ [ ]? ## make space optional too - why? why not?
99
+ ## yes - eg. 1st, 2nd, 5th etc.
100
+ \p{L}+
101
+ )
102
+ [\d\p{L}'/° -]*? ## allow almost anything
103
+ ## fix - add negative lookahead
104
+ ## no space and dash etc.
105
+ ## only allowed "inline" not at the end
106
+ ## must end with letter or digit!
107
+ )
108
+ [ ]*? # slurp trailing spaces
109
+ :
110
+ (?=[ ]+) ## positive lookahead (must be followed by space!!)
111
+ )
112
+ }ix
113
+
114
+
115
+
116
+ PROP_BASICS_RE = %r{
117
+ (?<spaces> [ ]{2,}) |
118
+ (?<space> [ ])
119
+ |
120
+ (?<sym>
121
+ [;,\(\)\[\]-]
122
+ )
123
+ }ix
124
+
125
+ PROP_RE = Regexp.union(
126
+ PROP_BASICS_RE,
127
+ MINUTE_RE,
128
+ PROP_NAME_RE,
129
+ )
130
+
131
+
132
+ end # class Lexer
133
+ end # module SportDb
@@ -17,7 +17,7 @@ class Lexer
17
17
  ## 3-4 pen. 2-2 a.e.t.
18
18
  ## 2-2 a.e.t.
19
19
  SCORE__P_ET__RE = %r{
20
- (?<score>
20
+ (?<score_more>
21
21
  \b
22
22
  (?:
23
23
  (?<p1>\d{1,2}) - (?<p2>\d{1,2})
@@ -34,7 +34,7 @@ class Lexer
34
34
  ## note: allow SPECIAL with penalty only
35
35
  ## 3-4 pen.
36
36
  SCORE__P__RE = %r{
37
- (?<score>
37
+ (?<score_more>
38
38
  \b
39
39
  (?<p1>\d{1,2}) - (?<p2>\d{1,2})
40
40
  [ ]* #{P_EN}
@@ -52,7 +52,7 @@ class Lexer
52
52
  ## 2-2 a.e.t. (1-1)
53
53
 
54
54
  SCORE__P_ET_FT_HT__RE = %r{
55
- (?<score>
55
+ (?<score_more>
56
56
  \b
57
57
  (?:
58
58
  (?<p1>\d{1,2}) - (?<p2>\d{1,2})
@@ -79,7 +79,7 @@ class Lexer
79
79
  ## special case for case WITHOUT extra time!!
80
80
  ## same as above (but WITHOUT extra time and pen required)
81
81
  SCORE__P_FT_HT__RE = %r{
82
- (?<score>
82
+ (?<score_more>
83
83
  \b
84
84
  (?<p1>\d{1,2}) - (?<p2>\d{1,2})
85
85
  [ ]* #{P_EN} [ ]+
@@ -99,36 +99,47 @@ class Lexer
99
99
  ## note: \b works only after non-alphanum e.g. )
100
100
 
101
101
 
102
-
103
- ## e.g. 2-1 (1-1) or
104
- ## 2-1
105
-
102
+ ##########
103
+ ## e.g. 2-1 (1-1)
106
104
  SCORE__FT_HT__RE = %r{
107
- (?<score>
105
+ (?<score_more>
108
106
  \b
109
107
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
110
- (?:
111
108
  [ ]+ \( [ ]*
112
109
  (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
113
110
  [ ]* \)
114
- )? # note: make half time (HT) score optional for now
115
111
  (?=[ ,\]]|$)
116
112
  )}ix ## todo/check: remove loakahead assertion here - why require space?
117
113
  ## note: \b works only after non-alphanum e.g. )
118
114
 
119
-
115
+ #####
116
+ ## 2-1
117
+ SCORE__FT__RE = %r{
118
+ (?<score>
119
+ \b
120
+ (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
121
+ \b
122
+ )}ix
120
123
 
121
124
  #############################################
122
125
  # map tables
123
126
  # note: order matters; first come-first matched/served
127
+ #
128
+ ## check - find a better name for SCORE_MORE - SCORE_EX, SCORE_BIG, or ___ - why? why not?
124
129
 
125
- SCORE_RE = Regexp.union(
130
+ SCORE_MORE_RE = Regexp.union(
126
131
  SCORE__P_ET_FT_HT__RE, # e.g. 5-1 pen. 2-2 a.e.t. (1-1, 1-0)
127
132
  SCORE__P_FT_HT__RE, # e.g. 5-1 pen. (1-1)
128
133
  SCORE__P_ET__RE, # e.g. 2-2 a.e.t. or 5-1 pen. 2-2 a.e.t.
129
134
  SCORE__P__RE, # e.g. 5-1 pen.
130
- SCORE__FT_HT__RE, # e.g. 1-1 (1-0) or 1-1 -- note - must go last!!!
135
+ SCORE__FT_HT__RE, # e.g. 1-1 (1-0)
136
+ ## note - keep basic score as its own token!!!!
137
+ ## that is, SCORE & SCORE_MORE
138
+ ### SCORE__FT__RE, # e.g. 1-1 -- note - must go last!!!
131
139
  )
132
140
 
141
+ SCORE_RE = SCORE__FT__RE
142
+
143
+
133
144
  end # class Lexer
134
145
  end # module SportDb
@@ -24,6 +24,13 @@ class Lexer
24
24
  # allow Cote'd Ivoir or such
25
25
  ## e.g. add '
26
26
 
27
+ ## note:
28
+ ## make sure these do NOT match!!!
29
+ ## TEXT => "Matchday 1 / Group A"
30
+ ## TEXT => "Matchday 2 / Group A"
31
+ ## TEXT => "Matchday 3 / Group A"
32
+
33
+
27
34
 
28
35
  TEXT_RE = %r{
29
36
  ## must start with alpha (allow unicode letters!!)
@@ -59,11 +66,11 @@ TEXT_RE = %r{
59
66
  ## AND switch to case-sensitive (via -i!!!)
60
67
  )
61
68
  | # only single spaces allowed inline!!!
62
- [-]
69
+ [-/]
63
70
  )?
64
71
  (?:
65
72
  \p{L} |
66
- [&/'°]
73
+ [&'°]
67
74
  |
68
75
  (?:
69
76
  \d+