sportdb-parser 0.6.20 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Manifest.txt +14 -8
- data/Rakefile +1 -1
- data/lib/sportdb/parser/blocktxt.rb +99 -0
- data/lib/sportdb/parser/lexer.rb +958 -395
- data/lib/sportdb/parser/lexer_buffer.rb +97 -0
- data/lib/sportdb/parser/lexer_tty.rb +111 -0
- data/lib/sportdb/parser/parser.rb +1768 -855
- data/lib/sportdb/parser/racc_parser.rb +1 -1
- data/lib/sportdb/parser/racc_tree.rb +327 -41
- data/lib/sportdb/parser/token-date.rb +160 -178
- data/lib/sportdb/parser/token-date_duration.rb +190 -0
- data/lib/sportdb/parser/token-geo.rb +59 -59
- data/lib/sportdb/parser/token-goals.rb +460 -0
- data/lib/sportdb/parser/token-group.rb +43 -0
- data/lib/sportdb/parser/token-note.rb +40 -0
- data/lib/sportdb/parser/token-prop.rb +70 -54
- data/lib/sportdb/parser/token-prop_name.rb +74 -0
- data/lib/sportdb/parser/token-round.rb +102 -0
- data/lib/sportdb/parser/token-score.rb +323 -47
- data/lib/sportdb/parser/token-score_fuller.rb +435 -0
- data/lib/sportdb/parser/token-score_legs.rb +59 -0
- data/lib/sportdb/parser/token-status.rb +157 -160
- data/lib/sportdb/parser/token-table.rb +149 -0
- data/lib/sportdb/parser/token-text.rb +72 -23
- data/lib/sportdb/parser/token-time.rb +141 -0
- data/lib/sportdb/parser/token.rb +242 -105
- data/lib/sportdb/parser/token_helpers.rb +92 -0
- data/lib/sportdb/parser/version.rb +2 -2
- data/lib/sportdb/parser.rb +24 -2
- metadata +18 -18
- data/config/rounds_de.txt +0 -125
- data/config/rounds_en.txt +0 -29
- data/config/rounds_es.txt +0 -26
- data/config/rounds_misc.txt +0 -25
- data/config/rounds_pt.txt +0 -4
- data/config/zones_en.txt +0 -20
- data/lib/sportdb/parser/lang.rb +0 -298
- data/lib/sportdb/parser/token-minute.rb +0 -205
|
@@ -14,7 +14,7 @@ class Lexer
|
|
|
14
14
|
GEO_TEXT_RE = %r{
|
|
15
15
|
## must start with alpha (allow unicode letters!!)
|
|
16
16
|
(?<text>
|
|
17
|
-
|
|
17
|
+
## positive lookbehind - for now space (or beginning of line - for testing) only
|
|
18
18
|
## (MUST be fixed number of chars - no quantifier e.g. +? etc.)
|
|
19
19
|
(?<= [ ,›>\[\]]|^)
|
|
20
20
|
(?:
|
|
@@ -40,21 +40,42 @@ GEO_TEXT_RE = %r{
|
|
|
40
40
|
## for single spaces only (and _/ MUST not be surround by spaces)
|
|
41
41
|
|
|
42
42
|
(?:
|
|
43
|
-
[ ]? # only single spaces allowed inline!!!
|
|
44
43
|
(?:
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
44
|
+
[ ]? # only single (inline) space allowed - double spaces are breaks!!!
|
|
45
|
+
(?:
|
|
46
|
+
\p{L} | \d | [.&'°]
|
|
47
|
+
|
|
|
48
|
+
(?: (?<! [ ]) ## no space allowed before (but possible after)
|
|
49
|
+
[-]
|
|
50
|
+
)
|
|
51
|
+
|
|
|
52
|
+
(?: (?<! [ ]) ## no spaces allowed around these characters
|
|
53
|
+
[_/]
|
|
54
|
+
(?! [ ])
|
|
55
|
+
)
|
|
56
|
+
)+
|
|
57
|
+
)
|
|
58
|
+
|
|
|
59
|
+
## for now allow auto-add optional
|
|
60
|
+
## parenthesis enclosed closed text
|
|
61
|
+
## e.g. Dublin (Dalymount Park)
|
|
62
|
+
## Bucuresti (23 August)
|
|
63
|
+
## Paris (Parc des Princes)
|
|
64
|
+
## Ost-Berlin (Walter-Ulbricht)
|
|
65
|
+
## Athinai (OAKA - Maroussi)
|
|
66
|
+
##
|
|
67
|
+
## or Valencia (Spain) or Solna
|
|
68
|
+
(?:
|
|
69
|
+
[ ]
|
|
70
|
+
\(
|
|
71
|
+
[^()\[\],;:›<>]+ ## todo - add more special chars
|
|
72
|
+
## maybe list only allowed ones??
|
|
73
|
+
## make pattern more strict - why? why not?
|
|
74
|
+
\)
|
|
75
|
+
)
|
|
76
|
+
)*
|
|
77
|
+
|
|
78
|
+
|
|
58
79
|
## must NOT end with space or dash(-)
|
|
59
80
|
## todo/fix - possible in regex here
|
|
60
81
|
## only end in alphanum a-z0-9 (not dot or & ???)
|
|
@@ -63,56 +84,15 @@ GEO_TEXT_RE = %r{
|
|
|
63
84
|
## must be space!!!
|
|
64
85
|
## (or comma or start/end of string)
|
|
65
86
|
## kind of \b !!!
|
|
66
|
-
##
|
|
87
|
+
## POSITIVE lookahead
|
|
67
88
|
(?=[ ,›>\[\]]|$)
|
|
68
|
-
)
|
|
69
|
-
}ix
|
|
70
|
-
|
|
71
|
-
|
|
72
89
|
|
|
73
|
-
##
|
|
74
|
-
# for timezone format use for now:
|
|
75
|
-
# (BRT/UTC-3) (e.g. brazil time)
|
|
76
|
-
#
|
|
77
|
-
# (CET/UTC+1) - central european time
|
|
78
|
-
# (CEST/UTC+2) - central european summer time - daylight saving time (DST).
|
|
79
|
-
# (EET/UTC+1) - eastern european time
|
|
80
|
-
# (EEST/UTC+2) - eastern european summer time - daylight saving time (DST).
|
|
81
|
-
#
|
|
82
|
-
# UTC+3
|
|
83
|
-
# UTC+4
|
|
84
|
-
# UTC+0
|
|
85
|
-
# UTC+00
|
|
86
|
-
# UTC+0000
|
|
87
|
-
#
|
|
88
|
-
# - allow +01 or +0100 - why? why not
|
|
89
|
-
# - +0130 (01:30)
|
|
90
|
-
#
|
|
91
|
-
# see
|
|
92
|
-
# https://en.wikipedia.org/wiki/Time_zone
|
|
93
|
-
# https://en.wikipedia.org/wiki/List_of_UTC_offsets
|
|
94
|
-
# https://en.wikipedia.org/wiki/UTC−04:00 etc.
|
|
95
|
-
#
|
|
96
|
-
# e.g. (UTC-2) or (CEST/UTC-2) etc.
|
|
97
|
-
# todo check - only allow upcase
|
|
98
|
-
# or (utc-2) and (cest/utc-2) too - why? why not?
|
|
99
|
-
|
|
100
|
-
TIMEZONE_RE = %r{
|
|
101
|
-
(?<timezone>
|
|
102
|
-
\(
|
|
103
|
-
## optional "local" timezone name eg. BRT or CEST etc.
|
|
104
|
-
(?: [a-z]+
|
|
105
|
-
/
|
|
106
|
-
)?
|
|
107
|
-
[a-z]+
|
|
108
|
-
[+-]
|
|
109
|
-
\d{1,4} ## e.g. 0 or 00 or 0000
|
|
110
|
-
\)
|
|
111
90
|
)
|
|
112
91
|
}ix
|
|
113
92
|
|
|
114
93
|
|
|
115
94
|
|
|
95
|
+
|
|
116
96
|
GEO_BASICS_RE = %r{
|
|
117
97
|
(?<spaces> [ ]{2,}) |
|
|
118
98
|
(?<space> [ ])
|
|
@@ -121,10 +101,30 @@ GEO_BASICS_RE = %r{
|
|
|
121
101
|
}ix
|
|
122
102
|
|
|
123
103
|
|
|
104
|
+
## note - add "hacky" check for comma that is followed by a prop(erty)
|
|
105
|
+
##
|
|
106
|
+
## make sure to NOT match
|
|
107
|
+
## props e.g. att: 18000
|
|
108
|
+
## July 10 @ Paris, Parc des Princes, att: 18000
|
|
109
|
+
## July 10 @ Paris, Parc des Princes, att: 18000
|
|
110
|
+
##
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
GEO_END_RE = %r{
|
|
114
|
+
(?<geo_end>
|
|
115
|
+
,
|
|
116
|
+
)
|
|
117
|
+
## POSITIVE lookahead for props
|
|
118
|
+
(?=
|
|
119
|
+
[ ]* ## optional spaces
|
|
120
|
+
(?: att|ref) ## todo/fix - use generic [a-z]+ - why? why not?
|
|
121
|
+
:
|
|
122
|
+
)
|
|
123
|
+
}ix
|
|
124
124
|
|
|
125
125
|
|
|
126
126
|
GEO_RE = Regexp.union(
|
|
127
|
-
|
|
127
|
+
GEO_END_RE,
|
|
128
128
|
GEO_BASICS_RE,
|
|
129
129
|
GEO_TEXT_RE,
|
|
130
130
|
ANY_RE,
|
|
@@ -0,0 +1,460 @@
|
|
|
1
|
+
module SportDb
|
|
2
|
+
class Lexer
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
######################################################
|
|
6
|
+
## goal mode (switched to by PLAYER_WITH_MINUTE_RE)
|
|
7
|
+
##
|
|
8
|
+
## note - must be enclosed in ()!!!
|
|
9
|
+
## todo - add () in basics - why? why not?
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
##
|
|
14
|
+
## todo/fix - split up BASICS!!!
|
|
15
|
+
## break out SPACES_RE for general reuse!!!
|
|
16
|
+
## makes it easier to use "custom" symbols (<sym>)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
GOAL_BASICS_RE = %r{
|
|
20
|
+
(?<spaces> [ ]{2,}) |
|
|
21
|
+
(?<space> [ ])
|
|
22
|
+
|
|
|
23
|
+
(?<sym>
|
|
24
|
+
[;,)] ## add (-) dash too - why? why not?
|
|
25
|
+
)
|
|
26
|
+
}ix
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
## note - assume lines starting with opening ( are goal lines!!!!
|
|
34
|
+
## note - use \A (instead of ^) - \A strictly matches the start of the string.
|
|
35
|
+
##
|
|
36
|
+
## note - check for negative lookahead
|
|
37
|
+
## to exclude ord (numbers) e.g. (1), (42), etc.!!!
|
|
38
|
+
##
|
|
39
|
+
## note - excludes (a), (h), (n) - TEAM_AWAY, TEAM_HOME, TEAM_NEUTRAL tokens (see the negative lookahead in START_GOAL_LINE_RE)
|
|
40
|
+
|
|
41
|
+
START_GOAL_LINE_RE = %r{
|
|
42
|
+
\A
|
|
43
|
+
[ ]* ## ignore leading spaces (if any)
|
|
44
|
+
\(
|
|
45
|
+
|
|
46
|
+
# check NEGATIVE lookahead
|
|
47
|
+
(?!
|
|
48
|
+
## exclude (a), (h), (n)
|
|
49
|
+
## TEAM_AWAY, TEAM_HOME, TEAM_NEUTRAL
|
|
50
|
+
(?: a|h|n )
|
|
51
|
+
\)
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
}xi
|
|
55
|
+
|
|
56
|
+
=begin
|
|
57
|
+
# check NEGATIVE lookahead
|
|
58
|
+
(?!
|
|
59
|
+
## exclude ord
|
|
60
|
+
(?: \d+ \))
|
|
61
|
+
|
|
|
62
|
+
## exclude score - goal_line_alt!!!
|
|
63
|
+
(?: [ ]* \b
|
|
64
|
+
\d-\d ## score e.g. 1-0
|
|
65
|
+
\b )
|
|
66
|
+
)
|
|
67
|
+
=end
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
#############
|
|
71
|
+
## check for goal compat(ility) "legacy" line
|
|
72
|
+
## e.g.
|
|
73
|
+
## (6' Puskás 0-1, 9' Czibor 0-2, 11' Morlock 1-2, 18' Rahn 2-2,
|
|
74
|
+
## 84' Rahn 3-2)
|
|
75
|
+
## (6 Puskás 0-1, 9 Czibor 0-2, 11 Morlock 1-2, 18 Rahn 2-2,
|
|
76
|
+
## 84 Rahn 3-2)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
START_GOAL_LINE_COMPAT_RE = %r{
|
|
80
|
+
\A
|
|
81
|
+
[ ]* ## ignore leading spaces (if any)
|
|
82
|
+
\(
|
|
83
|
+
|
|
84
|
+
## (i) check NEGATIVE lookahead
|
|
85
|
+
## exclude score e.g. 1-1 etc.
|
|
86
|
+
(?! [ ]* \b \d-\d \b)
|
|
87
|
+
|
|
88
|
+
## (ii) check POSITIVE lookahead
|
|
89
|
+
(?= [ ]*
|
|
90
|
+
\d{1,3}
|
|
91
|
+
'? ## optional minute marker
|
|
92
|
+
(?: \+
|
|
93
|
+
\d{1,2}
|
|
94
|
+
'? ## optional minute marker
|
|
95
|
+
)?
|
|
96
|
+
)
|
|
97
|
+
}xi
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
###
|
|
102
|
+
## check for goal line (alternate syntax)
|
|
103
|
+
## (1-0 Player, 1-1 Player, ...)
|
|
104
|
+
# note - the line may start off with the score OR include the score later on (anywhere after the opening parenthesis)
|
|
105
|
+
##
|
|
106
|
+
## note - allow "centered" style e.g.
|
|
107
|
+
## ( Player 44' (p) 1-0
|
|
108
|
+
## 1-1 Player 64' )
|
|
109
|
+
START_GOAL_LINE_ALT_RE = %r{
|
|
110
|
+
\A
|
|
111
|
+
[ ]* ## ignore leading spaces (if any)
|
|
112
|
+
\(
|
|
113
|
+
|
|
114
|
+
# check POSITIVE lookahead
|
|
115
|
+
(?= .*? ## note - non-greedy
|
|
116
|
+
\b \d-\d \b ## score e.g. 0-1
|
|
117
|
+
)
|
|
118
|
+
}xi
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
###
|
|
123
|
+
## e.g. (-; Metzger)
|
|
124
|
+
GOAL_NONE_RE = %r{ (?<goals_none>
|
|
125
|
+
-[ ]*;
|
|
126
|
+
)
|
|
127
|
+
}x
|
|
128
|
+
|
|
129
|
+
###
|
|
130
|
+
# note - alternate goal separator dash (-) MUST have leading and trailing space!!!
|
|
131
|
+
# e.g. (Metzger 83 - Krämer 29, 88, Cichy 33, Rahn 37)
|
|
132
|
+
# e.g. (Metzger - Krämer (2), Cichy, Rahn)
|
|
133
|
+
# (Brunnenmeier 17 - Gerwien 74)
|
|
134
|
+
# (Brunnenmeier - Gerwien)
|
|
135
|
+
# that is, NOT allowed
|
|
136
|
+
# e.g. (Metzger 83-Krämer 29, 88, Cichy 33, Rahn 37)
|
|
137
|
+
# (Brunnenmeier 17-Gerwien 74)
|
|
138
|
+
# (Brunnenmeier-Gerwien)
|
|
139
|
+
#
|
|
140
|
+
# note - allow split by - e.g.
|
|
141
|
+
# Frankfurt 4-2 Schalke (Kreß 45, Solz 55, Trimhold 58, Huberts 73 p -
|
|
142
|
+
# Berz 7, Herrmann 74)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
GOAL_SEP_ALT_RE = %r{
|
|
146
|
+
(?<goal_sep_alt>
|
|
147
|
+
(?<=[ ]) ## positive lookbehind - space required
|
|
148
|
+
-
|
|
149
|
+
(?=[ ]|\z) ## positive lookahead - space (or end of string) required
|
|
150
|
+
)}x
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
## e.g. (2)
|
|
154
|
+
## (2/p), (2/pen.), (3/2p), (3/ 2 pen.)
|
|
155
|
+
## -or- (2,1pen), (3, 2 pens)
|
|
156
|
+
##
|
|
157
|
+
## (p), (pen.) (2 pen.), (2p)
|
|
158
|
+
## (og), (o.g.),
|
|
159
|
+
## (2og), (2 o.g.), (2ogs)
|
|
160
|
+
#
|
|
161
|
+
##
|
|
162
|
+
|
|
163
|
+
GOAL_COUNT_RE = %r{
|
|
164
|
+
(?<goal_count>
|
|
165
|
+
\(
|
|
166
|
+
(?:
|
|
167
|
+
## opt penalties
|
|
168
|
+
(?<pen>
|
|
169
|
+
(?: (?<pen_value> \d{1,2}) [ ]? )?
|
|
170
|
+
(?:pens|pen\.?|p)
|
|
171
|
+
)
|
|
172
|
+
|
|
|
173
|
+
## opt own goals (og)
|
|
174
|
+
(?<og>
|
|
175
|
+
(?: (?<og_value> \d{1,2}) [ ]? )?
|
|
176
|
+
(?:ogs?|o\.g\.|o)
|
|
177
|
+
)
|
|
178
|
+
|
|
|
179
|
+
## opt fallback - classic count/number
|
|
180
|
+
(?: (?<value> [1-9])
|
|
181
|
+
## check for option penalties
|
|
182
|
+
(?<pen>
|
|
183
|
+
[,/] [ ]*
|
|
184
|
+
(?: (?<pen_value> \d{1,2}) [ ]? )?
|
|
185
|
+
(?:pens|pen\.?|p)
|
|
186
|
+
)?
|
|
187
|
+
)
|
|
188
|
+
)
|
|
189
|
+
\)
|
|
190
|
+
)}ix
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
## minute variant for N/A not/available
|
|
198
|
+
## todo/check - find a better syntax - why? why not?
|
|
199
|
+
##
|
|
200
|
+
## note "??".to_i(10) returns 0 or
|
|
201
|
+
## "__".to_i(10) returns 0
|
|
202
|
+
## quick hack - assume 0 for n/a for now
|
|
203
|
+
|
|
204
|
+
MINUTE_NA_RE = %r{
|
|
205
|
+
(?<minute>
|
|
206
|
+
(?<=[ (]) # positive lookbehind for space or opening
|
|
207
|
+
(?<value> \?{2} | _{2} )
|
|
208
|
+
' ## must have minute marker!!!!
|
|
209
|
+
)
|
|
210
|
+
}ix
|
|
211
|
+
|
|
212
|
+
=begin
|
|
213
|
+
MINUTE_RE = %r{
|
|
214
|
+
(?<minute>
|
|
215
|
+
(?<=[ (]) # positive lookbehind for space or opening ( e.g. (61') required
|
|
216
|
+
# todo - add more lookbehinds e.g. ,) etc. - why? why not?
|
|
217
|
+
(?<value>\d{1,3}) ## constrain numbers to 0 to 999!!!
|
|
218
|
+
(?: \+
|
|
219
|
+
(?<value2>\d{1,3})
|
|
220
|
+
)?
|
|
221
|
+
' ## must have minute marker!!!!
|
|
222
|
+
)
|
|
223
|
+
}ix
|
|
224
|
+
=end
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
##
|
|
228
|
+
## note - inline \b check in MINUTE_RE excludes
|
|
229
|
+
## 85pen or 90+4pen or 38p (possible and NOT excluded in GOAL_MINUTE_RE !!!)
|
|
230
|
+
##
|
|
231
|
+
## minute with optional stoppage
|
|
232
|
+
|
|
233
|
+
MINUTE_RE = %r{
|
|
234
|
+
(?<minute>
|
|
235
|
+
\b
|
|
236
|
+
(?<value>\d{1,3}) ## constrain numbers to 0 to 999!!!
|
|
237
|
+
\b
|
|
238
|
+
'? ## optional minute marker
|
|
239
|
+
|
|
240
|
+
(?: \+ (?<value2>\d{1,2})
|
|
241
|
+
\b
|
|
242
|
+
'? ## optional minute marker
|
|
243
|
+
)?
|
|
244
|
+
|
|
245
|
+
)
|
|
246
|
+
}ix
|
|
247
|
+
|
|
248
|
+
## goal types
|
|
249
|
+
# (pen.) or (pen) or (p) - note: (p.) with a trailing dot is NOT matched by the pen patterns (pen\.?|p)
|
|
250
|
+
## (o.g.) or (og)
|
|
251
|
+
## todo/check - keep case-insensitive
|
|
252
|
+
## or allow OG or P or PEN or
|
|
253
|
+
## only lower case - why? why not?
|
|
254
|
+
##
|
|
255
|
+
## add (gg) for golden goal - why? why not?
|
|
256
|
+
## add (sg) for silver goal - why? why not??
|
|
257
|
+
|
|
258
|
+
GOAL_MINUTE_RE = %r{
|
|
259
|
+
(?<goal_minute>
|
|
260
|
+
\b
|
|
261
|
+
(?<value>\d{1,3}) ## constrain numbers to 0 to 999!!!
|
|
262
|
+
'? ## optional minute marker
|
|
263
|
+
|
|
264
|
+
(?: \+ (?<value2>\d{1,2})
|
|
265
|
+
'? ## optional minute marker
|
|
266
|
+
)?
|
|
267
|
+
|
|
268
|
+
## note - add goal minute qualifiers here inline!!!
|
|
269
|
+
(?:
|
|
270
|
+
(?: [ ]? (?<og> (?: \((?:og|o\.g\.|o)\)) ## allow (og)
|
|
271
|
+
|
|
|
272
|
+
(?: (?:og|o\.g\.|o)) ## allow plain og
|
|
273
|
+
)
|
|
274
|
+
)
|
|
275
|
+
|
|
|
276
|
+
(?: [ ]? (?<pen> (?: \((?:pen\.?|p)\)) ## allow ()
|
|
277
|
+
|
|
|
278
|
+
(?: (?:pen\.?|p))
|
|
279
|
+
)
|
|
280
|
+
)
|
|
281
|
+
|
|
|
282
|
+
## add experimental header qualifier
|
|
283
|
+
(?: [ ]? (?<hdr> \( (?:hdr\.?|h ) \) | (?: hdr\.?|h ) ))
|
|
284
|
+
|
|
|
285
|
+
## add experimental free kick qualifier
|
|
286
|
+
(?: [ ]? (?<fk> \( (?:fk\.?|f ) \) | (?: fk\.?|f) ))
|
|
287
|
+
)?
|
|
288
|
+
|
|
289
|
+
## add experimental seconds
|
|
290
|
+
## e.g. (95 secs) or (95sec) etc.
|
|
291
|
+
(?: [ ]* \(
|
|
292
|
+
(?<secs>\d{1,3})
|
|
293
|
+
[ ]?secs?
|
|
294
|
+
\)
|
|
295
|
+
)?
|
|
296
|
+
)
|
|
297
|
+
|
|
298
|
+
## note - check positive lookahead
|
|
299
|
+
(?=[ ,;)]|$)
|
|
300
|
+
}ix
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
GOAL_RE = Regexp.union(
|
|
308
|
+
GOAL_BASICS_RE,
|
|
309
|
+
GOAL_NONE_RE,
|
|
310
|
+
GOAL_MINUTE_RE,
|
|
311
|
+
GOAL_COUNT_RE,
|
|
312
|
+
## MINUTE_NA_RE, ## note - add/allow not/available (n/a,na) minutes hack for now
|
|
313
|
+
## GOAL_OG_RE, GOAL_PEN_RE,
|
|
314
|
+
## SCORE_RE, ## add back in v2 (level 3) or such!!
|
|
315
|
+
PROP_NAME_RE, ## note - (re)use prop name for now for (player) name
|
|
316
|
+
GOAL_SEP_ALT_RE,
|
|
317
|
+
## todo/fix - add ANY_RE !!!!
|
|
318
|
+
)
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
GOAL_TYPE_RE = %r{
|
|
323
|
+
(?<goal_type>
|
|
324
|
+
\(
|
|
325
|
+
(?:
|
|
326
|
+
(?<og> og|o\.g\.|o )
|
|
327
|
+
|
|
|
328
|
+
(?<pen> pen\.?|p )
|
|
329
|
+
|
|
|
330
|
+
## add experimental header qualifier
|
|
331
|
+
(?<hdr> hdr\.?|h )
|
|
332
|
+
|
|
|
333
|
+
## add experimental free kick qualifier
|
|
334
|
+
(?<fk> fk\.?|f )
|
|
335
|
+
)
|
|
336
|
+
\)
|
|
337
|
+
)}xi
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
GOAL_ALT_RE = Regexp.union(
|
|
342
|
+
GOAL_BASICS_RE,
|
|
343
|
+
SCORE_RE, ## e.g. 1-0, 0-1, etc.
|
|
344
|
+
GOAL_MINUTE_RE,
|
|
345
|
+
GOAL_TYPE_RE,
|
|
346
|
+
PROP_NAME_RE, ## note - (re)use prop name for now for (player) name
|
|
347
|
+
## todo/fix - add ANY_RE !!!!
|
|
348
|
+
)
|
|
349
|
+
|
|
350
|
+
GOAL_COMPAT_RE = Regexp.union(
|
|
351
|
+
GOAL_BASICS_RE,
|
|
352
|
+
SCORE_RE, ## e.g. 1-0, 0-1, etc.
|
|
353
|
+
MINUTE_RE, ## note - matches minute e.g. 92, 7, 7', 7+3, 90+4', etc. (a bare 46+ without stoppage digits only matches the 46)
|
|
354
|
+
GOAL_TYPE_RE,
|
|
355
|
+
PROP_NAME_RE, ## note - (re)use prop name for now for (player) name
|
|
356
|
+
## todo/fix - add ANY_RE !!!!
|
|
357
|
+
)
|
|
358
|
+
|
|
359
|
+
=begin
|
|
360
|
+
## note - leave out n/a minute in goals - make minutes optional!!!
|
|
361
|
+
PROP_GOAL_RE = Regexp.union(
|
|
362
|
+
GOAL_BASICS_RE,
|
|
363
|
+
MINUTE_RE,
|
|
364
|
+
## MINUTE_NA_RE, ## note - add/allow not/available (n/a,na) minutes hack for now
|
|
365
|
+
GOAL_OG_RE, GOAL_PEN_RE,
|
|
366
|
+
SCORE_RE,
|
|
367
|
+
PROP_NAME_RE, ## note - (re)use prop name for now for (player) name
|
|
368
|
+
)
|
|
369
|
+
=end
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def self._parse_goal_minute( str )
  ## Parse a stand-alone goal minute string (e.g. "45'", "90+4 (pen.)").
  ##   Leading/trailing spaces are ignored.
  ##
  ## Returns the hash built by _build_goal_minute when GOAL_MINUTE_RE
  ##   covers the complete (stripped) string; returns nil otherwise.
  md = GOAL_MINUTE_RE.match( str.strip )

  ## no match at all - nothing to build
  return nil if md.nil?

  ## a match that leaves text before or after is a partial match only
  ##   todo - report / error somehow??
  covers_all = md.pre_match.empty? && md.post_match.empty?
  covers_all ? _build_goal_minute( md ) : nil
end
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def self._build_goal_minute( m )
  ## Turn a goal minute match (anything answering m[:name]) into a hash.
  ##
  ##   :m        - minute as integer (always set; read from :value)
  ##   :offset   - stoppage/injury time as integer (only if :value2 captured)
  ##   :og / :pen / :freekick / :header
  ##             - set to true if the matching qualifier was captured
  ##   :secs     - seconds as integer (only if :secs captured)
  rec = { m: m[:value].to_i(10) }

  stoppage = m[:value2]
  rec[:offset] = stoppage.to_i(10) if stoppage

  ## capture name => result key (order kept as-is: og, pen, freekick, header)
  [[:og,  :og],
   [:pen, :pen],
   [:fk,  :freekick],
   [:hdr, :header]].each do |capture, key|
    rec[key] = true if m[capture]
  end

  seconds = m[:secs]
  rec[:secs] = seconds.to_i(10) if seconds

  rec
end

## instance-level convenience - forwards to the class-level builder
def _build_goal_minute( m ) self.class._build_goal_minute( m ); end
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
def self._build_minute( m )
  ## Turn a plain minute match (anything answering m[:name]) into a hash.
  ##
  ##   :m       - minute as integer (always set; read from :value)
  ##   :offset  - stoppage/injury time as integer (only if :value2 captured)
  parsed = { m: m[:value].to_i(10) }

  stoppage = m[:value2]
  parsed[:offset] = stoppage.to_i(10) if stoppage

  parsed
end

## instance-level convenience - forwards to the class-level builder
def _build_minute( m ) self.class._build_minute( m ); end
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
def self._parse_goal_count( str )
  ## Parse a stand-alone goal count string (e.g. "(2)", "(3/2p)", "(og)").
  ##   Leading/trailing spaces are ignored.
  ##
  ## Returns the hash built by _build_goal_count when GOAL_COUNT_RE
  ##   covers the complete (stripped) string; returns nil otherwise.
  md = GOAL_COUNT_RE.match( str.strip )

  ## no match at all - nothing to build
  return nil if md.nil?

  ## a match that leaves text before or after is a partial match only
  ##   todo - report / error somehow??
  covers_all = md.pre_match.empty? && md.post_match.empty?
  covers_all ? _build_goal_count( md ) : nil
end
|
|
435
|
+
|
|
436
|
+
def self._build_goal_count( m )
  ## Turn a goal count match (anything answering m[:name]) into a hash.
  ##
  ##   :count - number of goals as integer (only if :value captured)
  ##   :og    - number of own goals (only if :og captured;
  ##              defaults to 1 when no :og_value number is given)
  ##   :pen   - number of penalties (only if :pen captured;
  ##              defaults to 1 when no :pen_value number is given)
  tally = {}

  total = m[:value]
  tally[:count] = total.to_i(10) if total

  if m[:og]      ## check flag
    og_num = m[:og_value]
    tally[:og] = og_num ? og_num.to_i(10) : 1
  end

  if m[:pen]     ## check flag
    pen_num = m[:pen_value]
    tally[:pen] = pen_num ? pen_num.to_i(10) : 1
  end

  tally
end

## instance-level convenience - forwards to the class-level builder
def _build_goal_count( m ) self.class._build_goal_count( m ); end
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
def self._build_goal_type( m )
  ## Turn a goal type match (anything answering m[:name]) into a hash of flags.
  ##
  ##   :og / :pen / :freekick / :header
  ##     - set to true if the matching qualifier (og/pen/fk/hdr) was captured;
  ##       keys for qualifiers that were not captured are left out
  flags = {}

  ## capture name => result key (order kept as-is: og, pen, freekick, header)
  [[:og,  :og],
   [:pen, :pen],
   [:fk,  :freekick],
   [:hdr, :header]].each do |capture, key|
    flags[key] = true if m[capture]
  end

  flags
end

## instance-level convenience - forwards to the class-level builder
def _build_goal_type( m ) self.class._build_goal_type( m ); end
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
end # class Lexer
|
|
460
|
+
end # module SportDb
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
module SportDb
|
|
2
|
+
class Lexer
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
###
|
|
6
|
+
# check for start of group def line e.g.
|
|
7
|
+
# Group A | ...
|
|
8
|
+
# Group 1 : ....
|
|
9
|
+
# Group A2 | ....
|
|
10
|
+
## note - use \A (instead of ^) - \A strictly matches the start of the string.
|
|
11
|
+
GROUP_DEF_LINE_RE = %r{ \A
|
|
12
|
+
[ ]* ## ignore leading spaces (if any)
|
|
13
|
+
(?<group_def>
|
|
14
|
+
Group
|
|
15
|
+
[ ]
|
|
16
|
+
[a-z0-9]+ ## todo/check - allow dot (.) too e.g. 1.A etc.- why? why not?
|
|
17
|
+
)
|
|
18
|
+
### positive lookahead MUST be : OR |
|
|
19
|
+
(?= [ ]*
|
|
20
|
+
[:|]
|
|
21
|
+
[ ]) ## note: requires space for now after [:|] - keep - why? why not?
|
|
22
|
+
}ix
|
|
23
|
+
|
|
24
|
+
GROUP_DEF_BASICS_RE = %r{
|
|
25
|
+
(?<spaces> [ ]{2,}) |
|
|
26
|
+
(?<space> [ ])
|
|
27
|
+
|
|
|
28
|
+
(?<sym> [:|,] ) ### note - add comma (,) as optional separator
|
|
29
|
+
}ix
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
GROUP_DEF_RE = Regexp.union( GROUP_DEF_BASICS_RE,
|
|
33
|
+
TEXT_RE,
|
|
34
|
+
ANY_RE,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
end # class Lexer
|
|
43
|
+
end # module SportDb
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
module SportDb
|
|
2
|
+
class Lexer
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
### fix - use (?<text>) - text capture for inner text!!
|
|
6
|
+
## use (?<note> for complete match as a convention!! )
|
|
7
|
+
NOTE_RE = %r{
|
|
8
|
+
\[
|
|
9
|
+
(?<note>
|
|
10
|
+
[^\[\]\#]*? ## note - non-greedy/lazy operator
|
|
11
|
+
## exclude comments inside note block - why? why not?
|
|
12
|
+
)
|
|
13
|
+
\]
|
|
14
|
+
}xi
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
####
|
|
18
|
+
## fix - change NOTE_RE to MATCH_NOTE_RE !!!!
|
|
19
|
+
## and change NOTA_BENE_RE to NOTE_RE !!!
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
## check for "literal" (multi-line) note blocks
|
|
24
|
+
## eg. nb: or note:
|
|
25
|
+
## space required after colon (:) - why? why not?
|
|
26
|
+
##
|
|
27
|
+
## note - use \A (instead of ^) - \A strictly matches the start of the string.
|
|
28
|
+
NOTA_BENE_RE = %r{ \A
|
|
29
|
+
[ ]* ## ignore leading spaces (if any)
|
|
30
|
+
(?: nb | note) [ ]* : [ ]+
|
|
31
|
+
(?<nota_bene>
|
|
32
|
+
.+? ## use non-greedy
|
|
33
|
+
)
|
|
34
|
+
[ ]* ## ignore trailing spaces (if any)
|
|
35
|
+
\z
|
|
36
|
+
}xi
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
end # class Lexer
|
|
40
|
+
end # module SportDb
|