sportdb-parser 0.5.9 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -14,7 +14,7 @@ def initialize( txt, debug: false )
14
14
 
15
15
  ### todo:
16
16
  ## - pass along debug flag
17
- lexer = SportDb::Lexer.new( txt )
17
+ lexer = SportDb::Lexer.new( txt, debug: debug )
18
18
  ## note - use tokenize_with_errors and add/collect tokenize errors
19
19
  @tokens, @errors = lexer.tokenize_with_errors
20
20
  ## pp @tokens
@@ -71,10 +71,14 @@ RoundDef = Struct.new( :name, :date, :duration ) do
71
71
  end
72
72
  end
73
73
 
74
- DateHeader = Struct.new( :date ) do
74
+ DateHeader = Struct.new( :date, :time, :geo, :timezone ) do
75
75
  def pretty_print( printer )
76
76
  printer.text( "<DateHeader " )
77
- printer.text( "#{self.date.pretty_inspect}>" )
77
+ printer.text( "#{self.date.pretty_inspect}" )
78
+ printer.text( " time=#{self.time.pretty_inspect}" ) if self.time
79
+ printer.text( " geo=#{self.geo.pretty_inspect}" ) if self.geo
80
+ printer.text( " timezone=#{self.timezone}") if self.timezone
81
+ printer.text( ">")
78
82
  end
79
83
  end
80
84
 
@@ -85,14 +89,17 @@ GroupHeader = Struct.new( :name ) do
85
89
  end
86
90
  end
87
91
 
88
- RoundHeader = Struct.new( :names ) do
92
+ RoundHeader = Struct.new( :names, :group ) do
89
93
  def pretty_print( printer )
90
94
  printer.text( "<RoundHeader " )
91
- printer.text( "#{self.names.join(', ')}>" )
95
+ printer.text( "#{self.names.join(', ')}" )
96
+ printer.text( " group=#{self.group}") if self.group
97
+ printer.text( ">" )
92
98
  end
93
99
  end
94
100
 
95
- MatchLine = Struct.new( :ord, :date, :time,
101
+
102
+ MatchLine = Struct.new( :ord, :date, :time, :wday,
96
103
  :team1, :team2, :score,
97
104
  :status,
98
105
  :geo,
@@ -146,12 +146,29 @@ DATE_II_RE = %r{
146
146
  )}ix
147
147
 
148
148
 
149
+ # e.g. iso-date - 2011-08-25
150
+ ## todo/check - allow 2011-8-25 or 2011-8-3 / 2011-08-03 etc. - why? why not?
151
+ DATE_III_RE = %r{
152
+ (?<date>
153
+ \b
154
+ (?<year>\d{4})
155
+ -
156
+ (?<month>\d{2})
157
+ -
158
+ (?<day>\d{2})
159
+ \b
160
+ )}ix
161
+
162
+
163
+
164
+
149
165
  #############################################
150
166
  # map tables
151
167
  # note: order matters; first come-first matched/served
152
168
  DATE_RE = Regexp.union(
153
169
  DATE_I_RE,
154
- DATE_II_RE
170
+ DATE_II_RE,
171
+ DATE_III_RE,
155
172
  )
156
173
 
157
174
 
@@ -0,0 +1,45 @@
1
+
2
+ module SportDb
3
+ class Lexer
4
+
5
+ #
6
+ # todo/check - move goal type regexes to goal or somewhere else?
7
+ #
8
+
9
+ ## goal types
10
+ # (pen.) or (pen) or (p.) or (p)
11
+ ## (o.g.) or (og)
12
+ ## todo/check - keep case-insensitive
13
+ ## or allow OG or P or PEN or
14
+ ## only lower case - why? why not?
15
+ GOAL_PEN_RE = %r{
16
+ (?<pen> \(
17
+ (?:pen|p)\.?
18
+ \)
19
+ )
20
+ }ix
21
+ GOAL_OG_RE = %r{
22
+ (?<og> \(
23
+ (?:og|o\.g\.)
24
+ \)
25
+ )
26
+ }ix
27
+
28
+
29
+ MINUTE_RE = %r{
30
+ (?<minute>
31
+ (?<=[ (]) # positive lookbehind for space or opening ( e.g. (61') required
32
+ # todo - add more lookbehinds e.g. ,) etc. - why? why not?
33
+ (?<value>\d{1,3}) ## constrain numbers to 0 to 999!!!
34
+ (?: \+
35
+ (?<value2>\d{1,3})
36
+ )?
37
+ ' ## must have minute marker!!!!
38
+ )
39
+ }ix
40
+
41
+
42
+
43
+
44
+ end # class Lexer
45
+ end # module SportDb
@@ -0,0 +1,133 @@
1
+ ###
2
+ ## team prop mode e.g.
3
+ ##
4
+ ##
5
+ ## Fri Jun 14 21:00 @ München Fußball Arena, München
6
+ ## (1) Germany v Scotland 5-1 (3-0)
7
+ ## Wirtz 10' Musiala 19' Havertz 45+1' (pen.) Füllkrug 68' Can 90+3'; Rüdiger 87' (o.g.)
8
+ ##
9
+ ## Germany: Neuer - Kimmich, Rüdiger, Tah [Y], Mittelstädt - Andrich [Y] (Groß 46'),
10
+ ## Kroos (Can 80') - Musiala (Müller 74'), Gündogan, Wirtz (Sane 63') -
11
+ ## Havertz (Füllkrug 63')
12
+ ## Scotland: Gunn - Porteous [R 44'], Hendry, Tierney (McKenna 78') - Ralston [Y],
13
+ ## McTominay, McGregor (Gilmour 67'), Robertson - Christie (Shankland 82'),
14
+ ## Adams (Hanley 46'), McGinn (McLean 67')
15
+
16
+
17
+ module SportDb
18
+ class Lexer
19
+
20
+
21
+ ## name different from text (does NOT allow number in name/text)
22
+
23
+ PROP_NAME_RE = %r{
24
+ (?<prop_name> \b
25
+ (?<name>
26
+ \p{L}+
27
+ \.? ## optional dot
28
+ (?:
29
+ [ ]? # only single spaces allowed inline!!!
30
+ (?:
31
+ (?:
32
+ (?<=\p{L}) ## use lookbehind
33
+ [/'-] ## must be surrounded by letters
34
+ ## e.g. One/Two NOT
35
+ ## One/ Two or One / Two or One /Two etc.
36
+ (?=\p{L}) ## use lookahead
37
+ )
38
+ |
39
+ (?:
40
+ (?<=[ ]) ## use lookbehind -- add letter (plus dot) or such - why? why not?
41
+ ['] ## must be surrounded by leading space and
42
+ ## trailing letters (e.g. UDI 'Beter Bed)
43
+ (?=\p{L}) ## use lookahead
44
+ )
45
+ |
46
+ (?:
47
+ (?<=\p{L}) ## use lookbehind
48
+ ['] ## must be surrounded by leading letter and
49
+ ## trailing space PLUS letter (e.g. UDI' Beter Bed)
50
+ (?=[ ]\p{L}) ## use lookahead (space WITH letter)
51
+ )
52
+ | ## standard case with letter(s) and optional dot
53
+ (?: \p{L}+
54
+ \.? ## optional dot
55
+ )
56
+ )+
57
+ )*
58
+ )
59
+ ## add lookahead - must be non-alphanum
60
+ (?=[ ,;\]\)]|$)
61
+ )
62
+ }ix
63
+
64
+
65
+
66
+
67
+ ##############
68
+ # add support for props/ attributes e.g.
69
+ #
70
+ # Germany: Neuer - Kimmich, Rüdiger, Tah [Y], Mittelstädt - Andrich [Y] (46' Groß),
71
+ # Kroos (80' Can) - Musiala (74' Müller), Gündogan,
72
+ # Wirtz (63' Sane) - Havertz (63' Füllkrug)
73
+ # Scotland: Gunn - Porteous [R 44'], Hendry, Tierney (78' McKenna) - Ralston [Y],
74
+ # McTominay, McGregor (67' Gilmour), Robertson - Christie (82' Shankland),
75
+ # Adams (46' Hanley), McGinn (67' McLean)
76
+ #
77
+ ## note: colon (:) MUST be followed by one (or more) spaces
78
+ ## make sure mon feb 12 18:10 will not match
79
+ ## allow 1. FC Köln etc.
80
+ ## Mainz 05:
81
+ ## limit to 30 chars max
82
+ ## only allow chars incl. intl but (NOT ()[]/;)
83
+ ##
84
+ ## todo/fix:
85
+ ## check if St. Pölten works; with starting St. ???
86
+
87
+
88
+ PROP_KEY_RE = %r{
89
+ (?<prop_key> \b
90
+ (?<key>
91
+ (?:\p{L}+
92
+ |
93
+ \d+ # check for num lookahead (MUST be space or dot)
94
+ ## MUST be followed by (optional dot) and
95
+ ## required space !!!
96
+ ## MUST be followed by a to z!!!!
97
+ \.? ## optional dot
98
+ [ ]? ## make space optional too - why? why not?
99
+ ## yes - eg. 1st, 2nd, 5th etc.
100
+ \p{L}+
101
+ )
102
+ [\d\p{L}'/° -]*? ## allow almost anything
103
+ ## fix - add negative lookahead
104
+ ## no space and dash etc.
105
+ ## only allowed "inline" not at the end
106
+ ## must end with letter or digit!
107
+ )
108
+ [ ]*? # slurp trailing spaces
109
+ :
110
+ (?=[ ]+) ## positive lookahead (must be followed by space!!)
111
+ )
112
+ }ix
113
+
114
+
115
+
116
+ PROP_BASICS_RE = %r{
117
+ (?<spaces> [ ]{2,}) |
118
+ (?<space> [ ])
119
+ |
120
+ (?<sym>
121
+ [;,\(\)\[\]-]
122
+ )
123
+ }ix
124
+
125
+ PROP_RE = Regexp.union(
126
+ PROP_BASICS_RE,
127
+ MINUTE_RE,
128
+ PROP_NAME_RE,
129
+ )
130
+
131
+
132
+ end # class Lexer
133
+ end # module SportDb
@@ -24,6 +24,13 @@ class Lexer
24
24
  # allow Cote'd Ivoir or such
25
25
  ## e.g. add '
26
26
 
27
+ ## note:
28
+ ## make sure these do NOT match!!!
29
+ ## TEXT => "Matchday 1 / Group A"
30
+ ## TEXT => "Matchday 2 / Group A"
31
+ ## TEXT => "Matchday 3 / Group A"
32
+
33
+
27
34
 
28
35
  TEXT_RE = %r{
29
36
  ## must start with alpha (allow unicode letters!!)
@@ -59,11 +66,11 @@ TEXT_RE = %r{
59
66
  ## AND switch to case-sensitive (via -i!!!)
60
67
  )
61
68
  | # only single spaces allowed inline!!!
62
- [-]
69
+ [-/]
63
70
  )?
64
71
  (?:
65
72
  \p{L} |
66
- [&/'°]
73
+ [&'°]
67
74
  |
68
75
  (?:
69
76
  \d+
@@ -7,13 +7,14 @@ class Lexer
7
7
  ##
8
8
  # keep 18h30 - why? why not?
9
9
  # add support for 6:30pm 8:20am etc. - why? why not?
10
-
10
+ #
11
+ # check - only support h e.g. 18h30 or 18H30 too - why? why not?
12
+ # e.g. 18.30 (or 18:30 or 18h30)
11
13
  TIME_RE = %r{
12
- ## e.g. 18.30 (or 18:30 or 18h30)
13
14
  (?<time> \b
14
- (?<hour>\d{1,2})
15
+ (?: (?<hour>\d{1,2})
15
16
  (?: :|\.|h )
16
- (?<minute>\d{2})
17
+ (?<minute>\d{2}))
17
18
  \b
18
19
  )
19
20
  }ix
@@ -42,9 +43,12 @@ TIME_RE = %r{
42
43
  # https://en.wikipedia.org/wiki/Time_zone
43
44
  # https://en.wikipedia.org/wiki/List_of_UTC_offsets
44
45
  # https://en.wikipedia.org/wiki/UTC−04:00 etc.
45
-
46
+ #
47
+ # e.g. (UTC-2) or (CEST/UTC-2) etc.
48
+ # todo check - only allow upcase
49
+ # or (utc-2) and (cest/utc-2) too - why? why not?
50
+
46
51
  TIMEZONE_RE = %r{
47
- ## e.g. (UTC-2) or (CEST/UTC-2) etc.
48
52
  (?<timezone>
49
53
  \(
50
54
  ## optional "local" timezone name eg. BRT or CEST etc.
@@ -60,6 +64,35 @@ TIMEZONE_RE = %r{
60
64
 
61
65
 
62
66
 
67
+ ## add wday / stand-alone week day - as separate regex or
68
+ ## use TEXT with is_wday? check or such with
69
+ ## requirement of beginning of line (anchored to line) only??
70
+ ## - why? why not?
71
+
72
+ WDAY_RE = %r{
73
+ (?<wday>
74
+ \b # note - alternation (|) is lowest precedence (such
75
+ # parentheses required around \b()\b !!!
76
+ ## note - case sensitive (NOT case insensitive)!!!
77
+ (?<day_name>
78
+ (?-i:
79
+ Mon|Mo|
80
+ Tue|Tu|
81
+ Wed|We|
82
+ Thu|Th|
83
+ Fri|Fr|
84
+ Sat|Sa|
85
+ Sun|Su
86
+ ))
87
+ \b ## todo/check - must be followed by two spaces or space + [( etc.
88
+ ## to allow words starting with weekday abbreviations - why? why not?
89
+ ## check if any names (teams, rounds, etc) come up in practice
90
+ ## or maybe remove three letter abbreviations Mon/Tue
91
+ ## and keep only Mo/Tu/We etc. - why? why not?
92
+ )}x
93
+
94
+
95
+
63
96
 
64
97
  BASICS_RE = %r{
65
98
  ## e.g. (51) or (1) etc. - limit digits of number???
@@ -78,189 +111,22 @@ BASICS_RE = %r{
78
111
  (?<spaces> [ ]{2,}) |
79
112
  (?<space> [ ])
80
113
  |
81
- (?<sym>[;,@|\[\]-])
82
- }ix
83
-
84
-
85
- ## removed from basics
86
- =begin
87
- (?<none>
88
- (?<=[ \[]|^) # Positive lookbehind for space or [
89
- -
90
- (?=[ ]*;) # positive lookahead for space
91
- )
92
- |
93
- (?<vs>
94
- (?<=[ ]) # Positive lookbehind for space
95
- (?:
96
- vs\.?| ## allow optional dot (eg. vs. v.)
97
- v\.?|
98
- -
99
- ) # not bigger match first e.g. vs than v etc.
100
- (?=[ ]) # positive lookahead for space
101
- )
102
- |
103
-
104
- make - into a simple symbol !!!
105
- =end
106
-
107
-
108
- MINUTE_RE = %r{
109
- (?<minute>
110
- (?<=[ (]) # Positive lookbehind for space or opening ( e.g. (61') required
111
- (?<value>\d{1,3}) ## constrain numbers to 0 to 999!!!
112
- (?: \+
113
- (?<value2>\d{1,3})
114
- )?
115
- ' ## must have minute marker!!!!
116
- )
117
- }ix
118
-
119
-
120
- ## goal types
121
- # (pen.) or (pen) or (p.) or (p)
122
- ## (o.g.) or (og)
123
- GOAL_PEN_RE = %r{
124
- (?<pen> \(
125
- (?:pen|p)\.?
126
- \)
127
- )
128
- }ix
129
- GOAL_OG_RE = %r{
130
- (?<og> \(
131
- (?:og|o\.g\.)
132
- \)
133
- )
114
+ (?<sym>[;,/@|\[\]-])
134
115
  }ix
135
116
 
136
117
 
137
118
 
138
119
 
139
-
140
-
141
- PROP_BASICS_RE = %r{
142
- (?<spaces> [ ]{2,}) |
143
- (?<space> [ ])
144
- |
145
- (?<sym>[.;,\(\)\[\]-]) ## note - dot (.) is the (all-important) end-of-prop marker!!!
146
- }ix
147
-
148
-
149
- ## name different from text (does not allow number in name/text)
150
- ##
151
- ## note - includes special handling for dot (.) if at the end of line!!!
152
- ## end-of-line dot (.) is the prop end-of-marker - do NOT eat-up!!!
153
-
154
- PROP_NAME_RE = %r{
155
- (?<prop_name> \b
156
- (?<name>
157
- \p{L}+
158
- (?: \. (?: (?![ ]*$) )
159
- )? ## edge case - check for end of prop marker! (e.g. Stop.)
160
- (?:
161
- [ ]? # only single spaces allowed inline!!!
162
- (?:
163
- (?:
164
- (?<=\p{L}) ## use lookbehind
165
- [/'-] ## must be surrounded by letters
166
- ## e.g. One/Two NOT
167
- ## One/ Two or One / Two or One /Two etc.
168
- (?=\p{L}) ## use lookahead
169
- )
170
- |
171
- (?:
172
- (?<=[ ]) ## use lookbehind -- add letter (plus dot) or such - why? why not?
173
- ['] ## must be surrounded by leading space and
174
- ## traling letters (e.g. UDI 'Beter Bed)
175
- (?=\p{L}) ## use lookahead
176
- )
177
- |
178
- (?:
179
- (?<=\p{L}) ## use lookbehind
180
- ['] ## must be surrounded by leading letter and
181
- ## trailing space PLUS letter (e.g. UDI' Beter Bed)
182
- (?=[ ]\p{L}) ## use lookahead (space WITH letter
183
- )
184
- |
185
- (?: \p{L}+
186
- (?: \.
187
- (?: (?![ ]*$) )
188
- )? ## last dot is delimiter!!!
189
- )
190
- )+
191
- )*
192
- )
193
- ## add lookahead - must be non-alphanum (or dot)
194
- (?=[ .,;\]\)]|$)
195
- )
196
- }ix
197
-
198
-
199
-
200
-
201
- ##############
202
- # add support for props/ attributes e.g.
203
- #
204
- # Germany: Neuer - Kimmich, Rüdiger, Tah [Y], Mittelstädt – Andrich [Y] (46' Groß),
205
- # Kroos (80' Can) – Musiala (74' Müller), Gündogan,
206
- # Wirtz (63' Sane) – Havertz (63' Füllkrug).
207
- # Scotland: Gunn – Porteous [R 44'], Hendry, Tierney (78' McKenna) – Ralston [Y],
208
- # McTominay, McGregor (67' Gilmour), Robertson – Christie (82' Shankland),
209
- # Adams (46' Hanley), McGinn (67' McLean).
210
- #
211
- ## note: colon (:) MUST be followed by one (or more) spaces
212
- ## make sure mon feb 12 18:10 will not match
213
- ## allow 1. FC Köln etc.
214
- ## Mainz 05:
215
- ## limit to 30 chars max
216
- ## only allow chars incl. intl but (NOT ()[]/;)
217
-
218
-
219
- PROP_KEY_RE = %r{
220
- (?<prop_key> \b
221
- (?<key>
222
- (?:\p{L}+
223
- |
224
- \d+ # check for num lookahead (MUST be space or dot)
225
- ## MUST be followed by (optional dot) and
226
- ## required space !!!
227
- ## MUST be follow by a to z!!!!
228
- \.? ## optional dot
229
- [ ]? ## make space optional too - why? why not?
230
- ## yes - eg. 1st, 2nd, 5th etc.
231
- \p{L}+
232
- )
233
- [\d\p{L}'/° -]*? ## allow almost anyting
234
- ## fix - add negative lookahead
235
- ## no space and dash etc.
236
- ## only allowed "inline" not at the end
237
- ## must end with latter or digit!
238
- )
239
- [ ]*? # slurp trailing spaces
240
- :
241
- (?=[ ]+) ## possitive lookahead (must be followed by space!!)
242
- )
243
- }ix
244
-
245
-
246
-
247
-
248
- PROP_RE = Regexp.union(
249
- PROP_BASICS_RE,
250
- MINUTE_RE,
251
- PROP_NAME_RE,
252
- )
253
-
254
-
255
-
256
120
  RE = Regexp.union( PROP_KEY_RE, ## start with prop key (match will/should switch into prop mode!!!)
257
121
  STATUS_RE,
258
122
  TIMEZONE_RE,
259
123
  TIME_RE,
260
124
  DURATION_RE, # note - duration MUST match before date
261
125
  DATE_RE,
126
+ WDAY_RE, # allow standalone weekday name (e.g. Mo/Tu/etc.) - why? why not?
262
127
  SCORE_RE,
263
- BASICS_RE, MINUTE_RE,
128
+ BASICS_RE,
129
+ MINUTE_RE,
264
130
  GOAL_OG_RE, GOAL_PEN_RE,
265
131
  TEXT_RE )
266
132
 
@@ -3,8 +3,8 @@ module SportDb
3
3
  module Module
4
4
  module Parser
5
5
  MAJOR = 0 ## todo: namespace inside version or something - why? why not??
6
- MINOR = 5
7
- PATCH = 9
6
+ MINOR = 6
7
+ PATCH = 0
8
8
  VERSION = [MAJOR,MINOR,PATCH].join('.')
9
9
 
10
10
  def self.version
@@ -21,6 +21,8 @@ require_relative 'parser/token-score'
21
21
  require_relative 'parser/token-date'
22
22
  require_relative 'parser/token-text'
23
23
  require_relative 'parser/token-status'
24
+ require_relative 'parser/token-minute'
25
+ require_relative 'parser/token-prop' ## team prop(erty) mode (note - must be before token)
24
26
  require_relative 'parser/token'
25
27
  require_relative 'parser/lexer'
26
28
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sportdb-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.9
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-01-29 00:00:00.000000000 Z
11
+ date: 2025-01-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: cocos
@@ -102,6 +102,8 @@ files:
102
102
  - lib/sportdb/parser/racc_parser.rb
103
103
  - lib/sportdb/parser/racc_tree.rb
104
104
  - lib/sportdb/parser/token-date.rb
105
+ - lib/sportdb/parser/token-minute.rb
106
+ - lib/sportdb/parser/token-prop.rb
105
107
  - lib/sportdb/parser/token-score.rb
106
108
  - lib/sportdb/parser/token-status.rb
107
109
  - lib/sportdb/parser/token-text.rb