sportdb-parser 0.5.8 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,7 +14,7 @@ def initialize( txt, debug: false )
14
14
 
15
15
  ### todo:
16
16
  ## - pass along debug flag
17
- lexer = SportDb::Lexer.new( txt )
17
+ lexer = SportDb::Lexer.new( txt, debug: debug )
18
18
  ## note - use tokenize_with_errors and add/collect tokenize errors
19
19
  @tokens, @errors = lexer.tokenize_with_errors
20
20
  ## pp @tokens
@@ -71,10 +71,14 @@ RoundDef = Struct.new( :name, :date, :duration ) do
71
71
  end
72
72
  end
73
73
 
74
- DateHeader = Struct.new( :date ) do
74
+ DateHeader = Struct.new( :date, :time, :geo, :timezone ) do
75
75
  def pretty_print( printer )
76
76
  printer.text( "<DateHeader " )
77
- printer.text( "#{self.date.pretty_inspect}>" )
77
+ printer.text( "#{self.date.pretty_inspect}" )
78
+ printer.text( " time=#{self.time.pretty_inspect}" ) if self.time
79
+ printer.text( " geo=#{self.geo.pretty_inspect}" ) if self.geo
80
+ printer.text( " timezone=#{self.timezone}") if self.timezone
81
+ printer.text( ">")
78
82
  end
79
83
  end
80
84
 
@@ -85,14 +89,17 @@ GroupHeader = Struct.new( :name ) do
85
89
  end
86
90
  end
87
91
 
88
- RoundHeader = Struct.new( :names ) do
92
+ RoundHeader = Struct.new( :names, :group ) do
89
93
  def pretty_print( printer )
90
94
  printer.text( "<RoundHeader " )
91
- printer.text( "#{self.names.join(', ')}>" )
95
+ printer.text( "#{self.names.join(', ')}" )
96
+ printer.text( " group=#{self.group}") if self.group
97
+ printer.text( ">" )
92
98
  end
93
99
  end
94
100
 
95
- MatchLine = Struct.new( :ord, :date, :time,
101
+
102
+ MatchLine = Struct.new( :ord, :date, :time, :wday,
96
103
  :team1, :team2, :score,
97
104
  :status,
98
105
  :geo,
@@ -146,12 +146,29 @@ DATE_II_RE = %r{
146
146
  )}ix
147
147
 
148
148
 
149
+ # e.g. iso-date - 2011-08-25
150
+ ## todo/check - allow 2011-8-25 or 2011-8-3 / 2011-08-03 etc. - why? why not?
151
+ DATE_III_RE = %r{
152
+ (?<date>
153
+ \b
154
+ (?<year>\d{4})
155
+ -
156
+ (?<month>\d{2})
157
+ -
158
+ (?<day>\d{2})
159
+ \b
160
+ )}ix
161
+
162
+
163
+
164
+
149
165
  #############################################
150
166
  # map tables
151
167
  # note: order matters; first come-first matched/served
152
168
  DATE_RE = Regexp.union(
153
169
  DATE_I_RE,
154
- DATE_II_RE
170
+ DATE_II_RE,
171
+ DATE_III_RE,
155
172
  )
156
173
 
157
174
 
@@ -0,0 +1,45 @@
1
+
2
+ module SportDb
3
+ class Lexer
4
+
5
+ #
6
+ # todo/check - move goal type regexes to goal or somewhere else?
7
+ #
8
+
9
+ ## goal types
10
+ # (pen.) or (pen) or (p.) or (p)
11
+ ## (o.g.) or (og)
12
+ ## todo/check - keep case-insensitive
13
+ ## or allow OG or P or PEN or
14
+ ## only lower case - why? why not?
15
+ GOAL_PEN_RE = %r{
16
+ (?<pen> \(
17
+ (?:pen|p)\.?
18
+ \)
19
+ )
20
+ }ix
21
+ GOAL_OG_RE = %r{
22
+ (?<og> \(
23
+ (?:og|o\.g\.)
24
+ \)
25
+ )
26
+ }ix
27
+
28
+
29
+ MINUTE_RE = %r{
30
+ (?<minute>
31
+ (?<=[ (]) # positive lookbehind for space or opening ( e.g. (61') required
32
+ # todo - add more lookbehinds e.g. ,) etc. - why? why not?
33
+ (?<value>\d{1,3}) ## constrain numbers to 0 to 999!!!
34
+ (?: \+
35
+ (?<value2>\d{1,3})
36
+ )?
37
+ ' ## must have minute marker!!!!
38
+ )
39
+ }ix
40
+
41
+
42
+
43
+
44
+ end # class Lexer
45
+ end # module SportDb
@@ -0,0 +1,133 @@
1
+ ###
2
+ ## team prop mode e.g.
3
+ ##
4
+ ##
5
+ ## Fri Jun 14 21:00 @ München Fußball Arena, München
6
+ ## (1) Germany v Scotland 5-1 (3-0)
7
+ ## Wirtz 10' Musiala 19' Havertz 45+1' (pen.) Füllkrug 68' Can 90+3'; Rüdiger 87' (o.g.)
8
+ ##
9
+ ## Germany: Neuer - Kimmich, Rüdiger, Tah [Y], Mittelstädt - Andrich [Y] (Groß 46'),
10
+ ## Kroos (Can 80') - Musiala (Müller 74'), Gündogan, Wirtz (Sane 63') -
11
+ ## Havertz (Füllkrug 63')
12
+ ## Scotland: Gunn - Porteous [R 44'], Hendry, Tierney (McKenna 78') - Ralston [Y],
13
+ ## McTominay, McGregor (Gilmour 67'), Robertson - Christie (Shankland 82'),
14
+ ## Adams (Hanley 46'), McGinn (McLean 67')
15
+
16
+
17
+ module SportDb
18
+ class Lexer
19
+
20
+
21
+ ## name different from text (does NOT allow number in name/text)
22
+
23
+ PROP_NAME_RE = %r{
24
+ (?<prop_name> \b
25
+ (?<name>
26
+ \p{L}+
27
+ \.? ## optional dot
28
+ (?:
29
+ [ ]? # only single spaces allowed inline!!!
30
+ (?:
31
+ (?:
32
+ (?<=\p{L}) ## use lookbehind
33
+ [/'-] ## must be surrounded by letters
34
+ ## e.g. One/Two NOT
35
+ ## One/ Two or One / Two or One /Two etc.
36
+ (?=\p{L}) ## use lookahead
37
+ )
38
+ |
39
+ (?:
40
+ (?<=[ ]) ## use lookbehind -- add letter (plus dot) or such - why? why not?
41
+ ['] ## must be surrounded by leading space and
42
+ ## traling letters (e.g. UDI 'Beter Bed)
43
+ (?=\p{L}) ## use lookahead
44
+ )
45
+ |
46
+ (?:
47
+ (?<=\p{L}) ## use lookbehind
48
+ ['] ## must be surrounded by leading letter and
49
+ ## trailing space PLUS letter (e.g. UDI' Beter Bed)
50
+ (?=[ ]\p{L}) ## use lookahead (space WITH letter)
51
+ )
52
+ | ## standard case with letter(s) and optional dot
53
+ (?: \p{L}+
54
+ \.? ## optional dot
55
+ )
56
+ )+
57
+ )*
58
+ )
59
+ ## add lookahead - must be non-alphanum
60
+ (?=[ ,;\]\)]|$)
61
+ )
62
+ }ix
63
+
64
+
65
+
66
+
67
+ ##############
68
+ # add support for props/ attributes e.g.
69
+ #
70
+ # Germany: Neuer - Kimmich, Rüdiger, Tah [Y], Mittelstädt - Andrich [Y] (46' Groß),
71
+ # Kroos (80' Can) - Musiala (74' Müller), Gündogan,
72
+ # Wirtz (63' Sane) - Havertz (63' Füllkrug)
73
+ # Scotland: Gunn - Porteous [R 44'], Hendry, Tierney (78' McKenna) - Ralston [Y],
74
+ # McTominay, McGregor (67' Gilmour), Robertson - Christie (82' Shankland),
75
+ # Adams (46' Hanley), McGinn (67' McLean)
76
+ #
77
+ ## note: colon (:) MUST be followed by one (or more) spaces
78
+ ## make sure mon feb 12 18:10 will not match
79
+ ## allow 1. FC Köln etc.
80
+ ## Mainz 05:
81
+ ## limit to 30 chars max
82
+ ## only allow chars incl. intl but (NOT ()[]/;)
83
+ ##
84
+ ## todo/fix:
85
+ ## check if St. Pölten works; with starting St. ???
86
+
87
+
88
+ PROP_KEY_RE = %r{
89
+ (?<prop_key> \b
90
+ (?<key>
91
+ (?:\p{L}+
92
+ |
93
+ \d+ # check for num lookahead (MUST be space or dot)
94
+ ## MUST be followed by (optional dot) and
95
+ ## required space !!!
96
+ ## MUST be follow by a to z!!!!
97
+ \.? ## optional dot
98
+ [ ]? ## make space optional too - why? why not?
99
+ ## yes - eg. 1st, 2nd, 5th etc.
100
+ \p{L}+
101
+ )
102
+ [\d\p{L}'/° -]*? ## allow almost anything
103
+ ## fix - add negative lookahead
104
+ ## no space and dash etc.
105
+ ## only allowed "inline" not at the end
106
+ ## must end with letter or digit!
107
+ )
108
+ [ ]*? # slurp trailing spaces
109
+ :
110
+ (?=[ ]+) ## positive lookahead (must be followed by space!!)
111
+ )
112
+ }ix
113
+
114
+
115
+
116
+ PROP_BASICS_RE = %r{
117
+ (?<spaces> [ ]{2,}) |
118
+ (?<space> [ ])
119
+ |
120
+ (?<sym>
121
+ [;,\(\)\[\]-]
122
+ )
123
+ }ix
124
+
125
+ PROP_RE = Regexp.union(
126
+ PROP_BASICS_RE,
127
+ MINUTE_RE,
128
+ PROP_NAME_RE,
129
+ )
130
+
131
+
132
+ end # class Lexer
133
+ end # module SportDb
@@ -24,6 +24,13 @@ class Lexer
24
24
  # allow Cote'd Ivoir or such
25
25
  ## e.g. add '
26
26
 
27
+ ## note:
28
+ ## make sure these do NOT match!!!
29
+ ## TEXT => "Matchday 1 / Group A"
30
+ ## TEXT => "Matchday 2 / Group A"
31
+ ## TEXT => "Matchday 3 / Group A"
32
+
33
+
27
34
 
28
35
  TEXT_RE = %r{
29
36
  ## must start with alpha (allow unicode letters!!)
@@ -53,15 +60,17 @@ TEXT_RE = %r{
53
60
  \d+\.-\d+\. [ ]? \p{L}+
54
61
  )
55
62
 
56
- (?:(?: (?:[ ]
57
- (?!vs?[ ]) ## note - exclude (v[ ]/vs[ ])
58
- )
63
+ (?:(?: (?:[ ]
64
+ (?! (?-i: vs?[ ])
65
+ ) ## note - exclude (v[ ]/vs[ ])
66
+ ## AND switch to case-sensitive (via -i!!!)
67
+ )
59
68
  | # only single spaces allowed inline!!!
60
- [-]
69
+ [-/]
61
70
  )?
62
71
  (?:
63
72
  \p{L} |
64
- [&/'°]
73
+ [&'°]
65
74
  |
66
75
  (?:
67
76
  \d+
@@ -7,13 +7,14 @@ class Lexer
7
7
  ##
8
8
  # keep 18h30 - why? why not?
9
9
  # add support for 6:30pm 8:20am etc. - why? why not?
10
-
10
+ #
11
+ # check - only support h e.g. 18h30 or 18H30 too - why? why not?
12
+ # e.g. 18.30 (or 18:30 or 18h30)
11
13
  TIME_RE = %r{
12
- ## e.g. 18.30 (or 18:30 or 18h30)
13
14
  (?<time> \b
14
- (?<hour>\d{1,2})
15
+ (?: (?<hour>\d{1,2})
15
16
  (?: :|\.|h )
16
- (?<minute>\d{2})
17
+ (?<minute>\d{2}))
17
18
  \b
18
19
  )
19
20
  }ix
@@ -42,9 +43,12 @@ TIME_RE = %r{
42
43
  # https://en.wikipedia.org/wiki/Time_zone
43
44
  # https://en.wikipedia.org/wiki/List_of_UTC_offsets
44
45
  # https://en.wikipedia.org/wiki/UTC−04:00 etc.
45
-
46
+ #
47
+ # e.g. (UTC-2) or (CEST/UTC-2) etc.
48
+ # todo check - only allow upcase
49
+ # or (utc-2) and (cest/utc-2) too - why? why not?
50
+
46
51
  TIMEZONE_RE = %r{
47
- ## e.g. (UTC-2) or (CEST/UTC-2) etc.
48
52
  (?<timezone>
49
53
  \(
50
54
  ## optional "local" timezone name eg. BRT or CEST etc.
@@ -60,207 +64,69 @@ TIMEZONE_RE = %r{
60
64
 
61
65
 
62
66
 
67
+ ## add wday / stand-alone week day - as separate regex or
68
+ ## use TEXT with is_wday? check or such with
69
+ ## requirement of beginning of line (anchored to line) only??
70
+ ## - why? why not?
71
+
72
+ WDAY_RE = %r{
73
+ (?<wday>
74
+ \b # note - alternation (|) is lowest precedence (such
75
+ # parentheses required around \b()\b !!!
76
+ ## note - case sensitive (via -i and no i flag) - only Mon, Tue, Mo, Tu, etc. match!!!
77
+ (?<day_name>
78
+ (?-i:
79
+ Mon|Mo|
80
+ Tue|Tu|
81
+ Wed|We|
82
+ Thu|Th|
83
+ Fri|Fr|
84
+ Sat|Sa|
85
+ Sun|Su
86
+ ))
87
+ \b ## todo/check - must be followed by two spaces or space + [( etc.
88
+ ## to allow words starting with weekday abbreviations - why? why not?
89
+ ## check if any names (teams, rounds, etc) come up in practice
90
+ ## or maybe remove three letter abbreviations Mon/Tue
91
+ ## and keep only Mo/Tu/We etc. - why? why not?
92
+ )}x
93
+
94
+
95
+
63
96
 
64
97
  BASICS_RE = %r{
65
98
  ## e.g. (51) or (1) etc. - limit digits of number???
99
+ ## todo/fix - change num to ord (for ordinal number)!!!!!
66
100
  (?<num> \( (?<value>\d+) \) )
67
101
  |
68
102
  (?<vs>
69
- (?<=[ ]) # Positive lookbehind for space
70
- (?:
71
- vs|v
72
- )
73
- # not bigger match first e.g. vs than v etc.
74
- # todo/fix - make vs|v case sensitive!!! only match v/vs - why? why not?
103
+ (?<=[ ]) # positive lookbehind for space
104
+ (?-i:
105
+ vs|v
106
+ ) # note - only match case sensitive (downcased letters)!!!
107
+ # note - bigger match first e.g. vs than v etc.
75
108
  (?=[ ]) # positive lookahead for space
76
109
  )
77
110
  |
78
111
  (?<spaces> [ ]{2,}) |
79
112
  (?<space> [ ])
80
113
  |
81
- (?<sym>[;,@|\[\]-])
82
- }ix
83
-
84
-
85
- ## removed from basics
86
- =begin
87
- (?<none>
88
- (?<=[ \[]|^) # Positive lookbehind for space or [
89
- -
90
- (?=[ ]*;) # positive lookahead for space
91
- )
92
- |
93
- (?<vs>
94
- (?<=[ ]) # Positive lookbehind for space
95
- (?:
96
- vs\.?| ## allow optional dot (eg. vs. v.)
97
- v\.?|
98
- -
99
- ) # not bigger match first e.g. vs than v etc.
100
- (?=[ ]) # positive lookahead for space
101
- )
102
- |
103
-
104
- make - into a simple symbol !!!
105
- =end
106
-
107
-
108
- MINUTE_RE = %r{
109
- (?<minute>
110
- (?<=[ (]) # Positive lookbehind for space or opening ( e.g. (61') required
111
- (?<value>\d{1,3}) ## constrain numbers to 0 to 999!!!
112
- (?: \+
113
- (?<value2>\d{1,3})
114
- )?
115
- ' ## must have minute marker!!!!
116
- )
117
- }ix
118
-
119
-
120
- ## goal types
121
- # (pen.) or (pen) or (p.) or (p)
122
- ## (o.g.) or (og)
123
- GOAL_PEN_RE = %r{
124
- (?<pen> \(
125
- (?:pen|p)\.?
126
- \)
127
- )
128
- }ix
129
- GOAL_OG_RE = %r{
130
- (?<og> \(
131
- (?:og|o\.g\.)
132
- \)
133
- )
114
+ (?<sym>[;,/@|\[\]-])
134
115
  }ix
135
116
 
136
117
 
137
118
 
138
119
 
139
-
140
-
141
- PROP_BASICS_RE = %r{
142
- (?<spaces> [ ]{2,}) |
143
- (?<space> [ ])
144
- |
145
- (?<sym>[.;,\(\)\[\]-]) ## note - dot (.) is the (all-important) end-of-prop marker!!!
146
- }ix
147
-
148
-
149
- ## name different from text (does not allow number in name/text)
150
- ##
151
- ## note - includes special handling for dot (.) if at the end of line!!!
152
- ## end-of-line dot (.) is the prop end-of-marker - do NOT eat-up!!!
153
-
154
- PROP_NAME_RE = %r{
155
- (?<prop_name> \b
156
- (?<name>
157
- \p{L}+
158
- (?: \. (?: (?![ ]*$) )
159
- )? ## edge case - check for end of prop marker! (e.g. Stop.)
160
- (?:
161
- [ ]? # only single spaces allowed inline!!!
162
- (?:
163
- (?:
164
- (?<=\p{L}) ## use lookbehind
165
- [/'-] ## must be surrounded by letters
166
- ## e.g. One/Two NOT
167
- ## One/ Two or One / Two or One /Two etc.
168
- (?=\p{L}) ## use lookahead
169
- )
170
- |
171
- (?:
172
- (?<=[ ]) ## use lookbehind -- add letter (plus dot) or such - why? why not?
173
- ['] ## must be surrounded by leading space and
174
- ## traling letters (e.g. UDI 'Beter Bed)
175
- (?=\p{L}) ## use lookahead
176
- )
177
- |
178
- (?:
179
- (?<=\p{L}) ## use lookbehind
180
- ['] ## must be surrounded by leading letter and
181
- ## trailing space PLUS letter (e.g. UDI' Beter Bed)
182
- (?=[ ]\p{L}) ## use lookahead (space WITH letter
183
- )
184
- |
185
- (?: \p{L}+
186
- (?: \.
187
- (?: (?![ ]*$) )
188
- )? ## last dot is delimiter!!!
189
- )
190
- )+
191
- )*
192
- )
193
- ## add lookahead - must be non-alphanum (or dot)
194
- (?=[ .,;\]\)]|$)
195
- )
196
- }ix
197
-
198
-
199
-
200
-
201
- ##############
202
- # add support for props/ attributes e.g.
203
- #
204
- # Germany: Neuer - Kimmich, Rüdiger, Tah [Y], Mittelstädt – Andrich [Y] (46' Groß),
205
- # Kroos (80' Can) – Musiala (74' Müller), Gündogan,
206
- # Wirtz (63' Sane) – Havertz (63' Füllkrug).
207
- # Scotland: Gunn – Porteous [R 44'], Hendry, Tierney (78' McKenna) – Ralston [Y],
208
- # McTominay, McGregor (67' Gilmour), Robertson – Christie (82' Shankland),
209
- # Adams (46' Hanley), McGinn (67' McLean).
210
- #
211
- ## note: colon (:) MUST be followed by one (or more) spaces
212
- ## make sure mon feb 12 18:10 will not match
213
- ## allow 1. FC Köln etc.
214
- ## Mainz 05:
215
- ## limit to 30 chars max
216
- ## only allow chars incl. intl but (NOT ()[]/;)
217
-
218
-
219
- PROP_KEY_RE = %r{
220
- (?<prop_key> \b
221
- (?<key>
222
- (?:\p{L}+
223
- |
224
- \d+ # check for num lookahead (MUST be space or dot)
225
- ## MUST be followed by (optional dot) and
226
- ## required space !!!
227
- ## MUST be follow by a to z!!!!
228
- \.? ## optional dot
229
- [ ]? ## make space optional too - why? why not?
230
- ## yes - eg. 1st, 2nd, 5th etc.
231
- \p{L}+
232
- )
233
- [\d\p{L}'/° -]*? ## allow almost anyting
234
- ## fix - add negative lookahead
235
- ## no space and dash etc.
236
- ## only allowed "inline" not at the end
237
- ## must end with latter or digit!
238
- )
239
- [ ]*? # slurp trailing spaces
240
- :
241
- (?=[ ]+) ## possitive lookahead (must be followed by space!!)
242
- )
243
- }ix
244
-
245
-
246
-
247
-
248
- PROP_RE = Regexp.union(
249
- PROP_BASICS_RE,
250
- MINUTE_RE,
251
- PROP_NAME_RE,
252
- )
253
-
254
-
255
-
256
120
  RE = Regexp.union( PROP_KEY_RE, ## start with prop key (match will/should switch into prop mode!!!)
257
121
  STATUS_RE,
258
122
  TIMEZONE_RE,
259
123
  TIME_RE,
260
124
  DURATION_RE, # note - duration MUST match before date
261
125
  DATE_RE,
126
+ WDAY_RE, # allow standalone weekday name (e.g. Mo/Tu/etc.) - why? why not?
262
127
  SCORE_RE,
263
- BASICS_RE, MINUTE_RE,
128
+ BASICS_RE,
129
+ MINUTE_RE,
264
130
  GOAL_OG_RE, GOAL_PEN_RE,
265
131
  TEXT_RE )
266
132
 
@@ -3,8 +3,8 @@ module SportDb
3
3
  module Module
4
4
  module Parser
5
5
  MAJOR = 0 ## todo: namespace inside version or something - why? why not??
6
- MINOR = 5
7
- PATCH = 8
6
+ MINOR = 6
7
+ PATCH = 0
8
8
  VERSION = [MAJOR,MINOR,PATCH].join('.')
9
9
 
10
10
  def self.version
@@ -21,6 +21,8 @@ require_relative 'parser/token-score'
21
21
  require_relative 'parser/token-date'
22
22
  require_relative 'parser/token-text'
23
23
  require_relative 'parser/token-status'
24
+ require_relative 'parser/token-minute'
25
+ require_relative 'parser/token-prop' ## team prop(erty) mode (note - must be before token)
24
26
  require_relative 'parser/token'
25
27
  require_relative 'parser/lexer'
26
28
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sportdb-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.8
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-01-25 00:00:00.000000000 Z
11
+ date: 2025-01-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: cocos
@@ -102,6 +102,8 @@ files:
102
102
  - lib/sportdb/parser/racc_parser.rb
103
103
  - lib/sportdb/parser/racc_tree.rb
104
104
  - lib/sportdb/parser/token-date.rb
105
+ - lib/sportdb/parser/token-minute.rb
106
+ - lib/sportdb/parser/token-prop.rb
105
107
  - lib/sportdb/parser/token-score.rb
106
108
  - lib/sportdb/parser/token-status.rb
107
109
  - lib/sportdb/parser/token-text.rb