sportdb-parser 0.7.1 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Manifest.txt +17 -4
- data/lib/sportdb/parser/lexer-on_goal.rb +172 -0
- data/lib/sportdb/parser/lexer-on_group_def.rb +31 -0
- data/lib/sportdb/parser/lexer-on_prop_lineup.rb +79 -0
- data/lib/sportdb/parser/lexer-on_prop_misc.rb +110 -0
- data/lib/sportdb/parser/lexer-on_prop_penalties.rb +40 -0
- data/lib/sportdb/parser/lexer-on_round_def.rb +37 -0
- data/lib/sportdb/parser/lexer-on_top.rb +125 -0
- data/lib/sportdb/parser/lexer-prep_doc.rb +131 -0
- data/lib/sportdb/parser/lexer-prep_line.rb +63 -0
- data/lib/sportdb/parser/lexer-tokenize.rb +449 -0
- data/lib/sportdb/parser/lexer.rb +133 -1363
- data/lib/sportdb/parser/lexer_buffer.rb +8 -37
- data/lib/sportdb/parser/lexer_token.rb +126 -0
- data/lib/sportdb/parser/parser.rb +1104 -1403
- data/lib/sportdb/parser/racc_parser.rb +36 -32
- data/lib/sportdb/parser/racc_tree.rb +65 -98
- data/lib/sportdb/parser/token-date--helpers.rb +130 -0
- data/lib/sportdb/parser/token-date--names.rb +108 -0
- data/lib/sportdb/parser/token-date.rb +20 -192
- data/lib/sportdb/parser/token-date_duration.rb +8 -27
- data/lib/sportdb/parser/token-geo.rb +16 -16
- data/lib/sportdb/parser/token-goals--helpers.rb +114 -0
- data/lib/sportdb/parser/token-goals.rb +103 -249
- data/lib/sportdb/parser/token-group.rb +8 -22
- data/lib/sportdb/parser/token-prop.rb +138 -124
- data/lib/sportdb/parser/token-prop_name.rb +48 -39
- data/lib/sportdb/parser/token-round.rb +21 -35
- data/lib/sportdb/parser/token-score--helpers.rb +189 -0
- data/lib/sportdb/parser/token-score.rb +9 -393
- data/lib/sportdb/parser/token-score_full.rb +331 -0
- data/lib/sportdb/parser/token-status.rb +44 -46
- data/lib/sportdb/parser/token-status_inline.rb +112 -0
- data/lib/sportdb/parser/token-text.rb +41 -31
- data/lib/sportdb/parser/token-time.rb +29 -26
- data/lib/sportdb/parser/token.rb +58 -159
- data/lib/sportdb/parser/version.rb +1 -1
- data/lib/sportdb/parser.rb +45 -17
- metadata +19 -6
- data/lib/sportdb/parser/blocktxt.rb +0 -99
- data/lib/sportdb/parser/lexer_tty.rb +0 -111
- data/lib/sportdb/parser/token-table.rb +0 -149
- data/lib/sportdb/parser/token_helpers.rb +0 -92
|
@@ -7,23 +7,12 @@ class Lexer
|
|
|
7
7
|
## do NOT use (anymore) as generic TEXT_RE
|
|
8
8
|
|
|
9
9
|
|
|
10
|
-
|
|
11
|
-
## todo - use ANY_RE to token_commons or such - for shared by many?
|
|
12
|
-
|
|
13
|
-
## general catch-all (RECOMMENDED (ALWAYS) use as last entry in union)
|
|
14
|
-
## to avoid advance of pos match!!!
|
|
15
|
-
ANY_RE = %r{
|
|
16
|
-
(?<any> .)
|
|
17
|
-
}ix
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
10
|
## note - TEXT_RE used for TEAM_NAMES
|
|
22
11
|
## plus as "legacy" shortcut for (simple) group or round names e.g.
|
|
23
12
|
## Group A, Group 1, ..
|
|
24
13
|
## Matchday 1, 1. Round,
|
|
25
|
-
## note - no exception for (shortcut) group or round (MUST match team name pattern!)
|
|
26
|
-
|
|
14
|
+
## note - no exception for (shortcut) group or round (MUST match team name pattern!)
|
|
15
|
+
|
|
27
16
|
|
|
28
17
|
|
|
29
18
|
## note - do NOT allow single alpha text for now
|
|
@@ -39,7 +28,7 @@ ANY_RE = %r{
|
|
|
39
28
|
# 1 FC ## allow 1-FC or 1FC - why? why not?
|
|
40
29
|
# 1FC"
|
|
41
30
|
# 1. FC
|
|
42
|
-
# 1.FC
|
|
31
|
+
# 1.FC
|
|
43
32
|
# 23° Noviembre
|
|
44
33
|
# 1890 Munich
|
|
45
34
|
# 1-FC - XXXX - not allowed for now, parse error
|
|
@@ -76,7 +65,7 @@ TEXT_RE = %r{
|
|
|
76
65
|
## MUST be followed by (optional dot) and
|
|
77
66
|
## required space !!!
|
|
78
67
|
## MUST be followed by a to z!!!!
|
|
79
|
-
[.°]? ## optional dot (.) or degree(°) - todo - add number sign too!!
|
|
68
|
+
[.°]? ## optional dot (.) or degree(°) - todo - add number sign too!!
|
|
80
69
|
[ ]? ## make space optional too - why? why not?
|
|
81
70
|
## yes - eg. 1st, 2nd, 5th etc.
|
|
82
71
|
\p{L}+
|
|
@@ -91,10 +80,10 @@ TEXT_RE = %r{
|
|
|
91
80
|
## note - exclude (v[ ]/vs[ ]/vs.[ ])
|
|
92
81
|
## AND switch to case-sensitive (via -i!!!)
|
|
93
82
|
(?! (?-i: (?: ## note - (big) V not matching for versus!!!
|
|
94
|
-
vs\.?|v|
|
|
95
|
-
|
|
96
|
-
n/p|N/P|
|
|
97
|
-
w/o|W/O|
|
|
83
|
+
vs\.?|v|
|
|
84
|
+
|
|
85
|
+
n/p|N/P|
|
|
86
|
+
w/o|W/O|
|
|
98
87
|
abd\.?|ABD|
|
|
99
88
|
aban\.?|ABAN|
|
|
100
89
|
susp\.?|SUSP|
|
|
@@ -103,20 +92,20 @@ TEXT_RE = %r{
|
|
|
103
92
|
po?stp\.?|PO?STP|P-P|
|
|
104
93
|
x-x|X-X|
|
|
105
94
|
awd\.?|AWD|
|
|
106
|
-
canc\.?|CANC ) [ ]
|
|
95
|
+
canc\.?|CANC ) [ ]
|
|
107
96
|
|
|
|
108
97
|
(?: bye|BYE ) (?:[ ]|$))
|
|
109
|
-
)
|
|
98
|
+
)
|
|
110
99
|
)
|
|
111
|
-
|
|
|
112
|
-
[/-] ## must NOT be surrounded by spaces
|
|
100
|
+
|
|
|
101
|
+
[/-] ## must NOT be surrounded by spaces
|
|
113
102
|
)?
|
|
114
103
|
(?:
|
|
115
|
-
\p{L}
|
|
104
|
+
\p{L}
|
|
116
105
|
|
|
|
117
106
|
(?: ## note - restrict [.&'] to single char usage (no doubled e.g. && etc.)
|
|
118
107
|
\. (?! \.) ## allow single points only (now two or more etc.)
|
|
119
|
-
|
|
|
108
|
+
|
|
|
120
109
|
& (?! &)
|
|
121
110
|
|
|
|
122
111
|
' (?! ')
|
|
@@ -126,11 +115,11 @@ TEXT_RE = %r{
|
|
|
126
115
|
\d+
|
|
127
116
|
(?!
|
|
128
117
|
[0-9h'+] | ## protected break on 12h / 12' / 1-1
|
|
129
|
-
## check usege for 3+4 - possible? where ? why?
|
|
118
|
+
## check usage for 3+4 - possible? where ? why?
|
|
130
119
|
(?:[.:-]\d) ## protected/exclude/break on 12.03 / 12:03 / 12-12
|
|
131
120
|
## BUT allow Park21-Arena for example e.g. 21-A :-)
|
|
132
121
|
)
|
|
133
|
-
[°]? ## followed by optional ord
|
|
122
|
+
[°]? ## followed by optional ord
|
|
134
123
|
## negative lookahead for numbers
|
|
135
124
|
## note - include digits itself!!!
|
|
136
125
|
## note - remove / (slash) e.g. allows UDI'19/Beter Bed
|
|
@@ -176,15 +165,15 @@ TEXT_RE = %r{
|
|
|
176
165
|
# e.g. (AUT) or ,AUT or AUT
|
|
177
166
|
(?:
|
|
178
167
|
[ ] ## note - do NOT allow more than one space!!! - why? why not?
|
|
179
|
-
\(
|
|
168
|
+
\(
|
|
180
169
|
## note - auto-exclude reserved (aet) from SCORE_FULLER_MORE!!!
|
|
181
170
|
## plus golden goal (gg)/sudden death (sd), silver goal (sg)
|
|
182
|
-
## (ht), (ft)
|
|
171
|
+
## (ht), (ft)
|
|
183
172
|
(?! (?: aet | agget | asdet | asget | ht | ft )
|
|
184
173
|
\)
|
|
185
|
-
)
|
|
174
|
+
)
|
|
186
175
|
(?:
|
|
187
|
-
[A-Z]{1,5}
|
|
176
|
+
[A-Z]{1,5}
|
|
188
177
|
)
|
|
189
178
|
\)
|
|
190
179
|
)
|
|
@@ -207,5 +196,26 @@ TEXT_RE = %r{
|
|
|
207
196
|
}ix
|
|
208
197
|
|
|
209
198
|
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
###
|
|
203
|
+
## helper for testing regex match for team names
|
|
204
|
+
|
|
205
|
+
def self._parse_team( str )
  ## try to match a team name with TEXT_RE -
  ##   returns the match (data) only if the string matches in full,
  ##   that is, from start to end-of-string; returns nil otherwise
  text  = str.strip      ## note - ignore leading/trailing spaces
  found = TEXT_RE.match( text )

  return nil   if found.nil?     ## no match

  ## note - a partial match (NOT anchored to start and end-of-string)
  ##          is treated like no match for now
  ##          todo/check - report as error somehow??
  return nil   unless found.pre_match.empty? && found.post_match.empty?

  found
end
|
|
218
|
+
|
|
219
|
+
|
|
210
220
|
end # class Lexer
|
|
211
221
|
end # module SportDb
|
|
@@ -12,7 +12,7 @@ class Lexer
|
|
|
12
12
|
# note - optional timezone possible e.g.
|
|
13
13
|
# 18:30 UTC+1 or 18:30 BST/UTC+1 or such!!!
|
|
14
14
|
# 18:30 UTC+01 or 18:30 BST/UTC+01
|
|
15
|
-
#
|
|
15
|
+
#
|
|
16
16
|
#
|
|
17
17
|
# note 18.30 no longer supported - MUST use 18:30 or 18h30 !!!
|
|
18
18
|
#
|
|
@@ -22,31 +22,31 @@ class Lexer
|
|
|
22
22
|
# and, thus, must always follow time
|
|
23
23
|
# e.g. 18:30 (19:30 BST)
|
|
24
24
|
#
|
|
25
|
-
## local time e.g (19:30 UTC+1) or (19:30 BST/UTC+1) or
|
|
25
|
+
## local time e.g (19:30 UTC+1) or (19:30 BST/UTC+1) or
|
|
26
26
|
## note - timezone is optional! e.g. (19:30) works too
|
|
27
27
|
|
|
28
28
|
|
|
29
29
|
TIME_RE = %r{
|
|
30
30
|
\b
|
|
31
|
-
(?<time>
|
|
31
|
+
(?<time>
|
|
32
32
|
(?<hour>\d{1,2})
|
|
33
|
-
[:h]
|
|
33
|
+
[:h]
|
|
34
34
|
(?<minute>\d{2})
|
|
35
|
-
|
|
35
|
+
|
|
36
36
|
#### optional (inline) timezone
|
|
37
37
|
## note - non-utc timezone MUST be hard-coded (added) here!!!
|
|
38
38
|
## avoids eating-up team names (separated by one space)
|
|
39
|
-
## e.g. 18:30 MEX v MEX
|
|
39
|
+
## e.g. 18:30 MEX v MEX
|
|
40
40
|
(?:
|
|
41
41
|
[ ] ## require space - why? why not
|
|
42
42
|
(?<timezone>
|
|
43
|
-
(?:
|
|
43
|
+
(?:
|
|
44
44
|
## GMT - Greenwich Mean Time
|
|
45
45
|
## BST - British Summer Time
|
|
46
46
|
## CES?T - Central European (Summer) Time
|
|
47
47
|
## EES?T - Eastern European (Summer) Time
|
|
48
48
|
##
|
|
49
|
-
(?: GMT|BST|CES?T|EES?T)
|
|
49
|
+
(?: GMT|BST|CES?T|EES?T)
|
|
50
50
|
(?: /
|
|
51
51
|
UTC (?: [+-]\d{1,4} | ±0)
|
|
52
52
|
)?
|
|
@@ -57,19 +57,20 @@ TIME_RE = %r{
|
|
|
57
57
|
)
|
|
58
58
|
)
|
|
59
59
|
)?
|
|
60
|
-
)
|
|
61
|
-
\b
|
|
60
|
+
)
|
|
61
|
+
\b
|
|
62
62
|
|
|
63
63
|
####
|
|
64
64
|
### note - local time is now INLINE and MUST follow time
|
|
65
|
-
(?:
|
|
65
|
+
(?:
|
|
66
66
|
[ ]+ ## todo/check - make space optional - why? why not?
|
|
67
67
|
\(
|
|
68
|
-
(?<time_local>
|
|
68
|
+
(?<time_local>
|
|
69
69
|
(?<local_hour>\d{1,2})
|
|
70
70
|
[:h] ### todo/fix - MUST match style in time above!!!
|
|
71
|
+
### use capture with backref!!!!
|
|
71
72
|
(?<local_minute>\d{2})
|
|
72
|
-
|
|
73
|
+
|
|
73
74
|
####
|
|
74
75
|
## optional "local" timezone name eg. BRT or CEST etc.
|
|
75
76
|
(?:
|
|
@@ -78,16 +79,16 @@ TIME_RE = %r{
|
|
|
78
79
|
(?: [A-Z]{3,4}
|
|
79
80
|
(?: /
|
|
80
81
|
UTC (?: [+-]\d{1,4} | ±0)
|
|
81
|
-
)?
|
|
82
|
+
)?
|
|
82
83
|
)
|
|
83
|
-
|
|
|
84
|
+
|
|
|
84
85
|
(?: ## e.g. 0 or 00 or 0000
|
|
85
86
|
UTC (?: [+-]\d{1,4} | ±0)
|
|
86
|
-
)
|
|
87
|
+
)
|
|
87
88
|
)
|
|
88
89
|
)? # note - make timezone optional!!!
|
|
89
90
|
)
|
|
90
|
-
\)
|
|
91
|
+
\)
|
|
91
92
|
)?
|
|
92
93
|
}ix
|
|
93
94
|
|
|
@@ -98,20 +99,20 @@ def self._build_time( m )
|
|
|
98
99
|
## 12h40 => 12:40 etc.
|
|
99
100
|
## keep string (no time-only type in ruby)
|
|
100
101
|
data = { time: {} }
|
|
101
|
-
|
|
102
|
+
|
|
102
103
|
hour = m[:hour].to_i(10) ## allow 08/07/etc.
|
|
103
104
|
minute = m[:minute].to_i(10)
|
|
104
|
-
|
|
105
|
+
|
|
105
106
|
## check if 24:00 possible? or only 0:00 (23:59)
|
|
106
107
|
unless (hour >=0 && hour <=23) &&
|
|
107
108
|
(minute >=0 && minute <=59)
|
|
108
109
|
raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
|
|
109
110
|
end
|
|
110
|
-
|
|
111
|
+
|
|
111
112
|
data[:time][:h] = hour
|
|
112
113
|
data[:time][:m] = minute
|
|
113
|
-
data[:time][:timezone] = m[:timezone] if m[:timezone]
|
|
114
|
-
|
|
114
|
+
data[:time][:timezone] = m[:timezone] if m[:timezone]
|
|
115
|
+
|
|
115
116
|
|
|
116
117
|
## check if local time present e.g.
|
|
117
118
|
## 18:30 (19:30)
|
|
@@ -121,21 +122,23 @@ def self._build_time( m )
|
|
|
121
122
|
|
|
122
123
|
local_hour = m[:local_hour].to_i(10) ## allow 08/07/etc.
|
|
123
124
|
local_minute = m[:local_minute].to_i(10)
|
|
124
|
-
|
|
125
|
+
|
|
125
126
|
## check if 24:00 possible? or only 0:00 (23:59)
|
|
126
127
|
## note - validate the LOCAL time here (local_hour/local_minute);
##          hour/minute of the main time got checked before already
##  todo/check - if 24:00 possible? or only 0:00 (23:59)
unless (local_hour >=0 && local_hour <=23) &&
       (local_minute >=0 && local_minute <=59)
  raise ArgumentError, "parse error - local time >#{m[:time_local]}< out-of-range"
end
|
|
130
|
-
|
|
131
|
+
|
|
131
132
|
data[:time_local][:h] = local_hour
|
|
132
133
|
data[:time_local][:m] = local_minute
|
|
133
|
-
data[:time_local][:timezone] = m[:local_timezone] if m[:local_timezone]
|
|
134
|
-
|
|
134
|
+
data[:time_local][:timezone] = m[:local_timezone] if m[:local_timezone]
|
|
135
|
+
end
|
|
135
136
|
|
|
136
137
|
data
|
|
137
138
|
end
|
|
138
139
|
## instance-level convenience - forwards to the class-level _build_time
def _build_time( m )
  self.class._build_time( m )
end
|
|
139
140
|
|
|
141
|
+
|
|
142
|
+
|
|
140
143
|
end # class Lexer
|
|
141
144
|
end # module SportDb
|
data/lib/sportdb/parser/token.rb
CHANGED
|
@@ -5,25 +5,6 @@ class Lexer
|
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
|
|
8
|
-
BASICS_RE = %r{
|
|
9
|
-
(?<vs>
|
|
10
|
-
(?<=[ ]) # positive lookbehind for space
|
|
11
|
-
(?-i:
|
|
12
|
-
vs\.?|v|VS
|
|
13
|
-
) # note - only match case sensitive (downcased letters)!!!
|
|
14
|
-
# note - bigger match first e.g. vs than v etc.
|
|
15
|
-
(?=[ ]) # positive lookahead for space
|
|
16
|
-
)
|
|
17
|
-
|
|
|
18
|
-
(?<spaces> [ ]{2,}) |
|
|
19
|
-
(?<space> [ ])
|
|
20
|
-
|
|
|
21
|
-
(?<sym> [,;/@|()\[\]-] ) ### note: add parantheses too e.g () - why? why not?
|
|
22
|
-
}ix
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
8
|
###
|
|
28
9
|
## add att(endance) e.g. att: 18000
|
|
29
10
|
##
|
|
@@ -32,7 +13,8 @@ BASICS_RE = %r{
|
|
|
32
13
|
ATTENDANCE_RE = %r{
|
|
33
14
|
(?<attendance>
|
|
34
15
|
\b
|
|
35
|
-
att
|
|
16
|
+
(?: attendance|att )
|
|
17
|
+
: [ ]*
|
|
36
18
|
(?<value>
|
|
37
19
|
[1-9]
|
|
38
20
|
(?: _? \d+ )*
|
|
@@ -41,125 +23,51 @@ ATTENDANCE_RE = %r{
|
|
|
41
23
|
)}ix
|
|
42
24
|
|
|
43
25
|
|
|
44
|
-
## "inline" match status e.g.
|
|
45
|
-
## Clapham Rovers w/o Hitchin
|
|
46
|
-
## Queen's Park bye
|
|
47
|
-
|
|
48
|
-
## add support for WO or W-0 too - why? why not?
|
|
49
|
-
INLINE_WO_RE = %r{
|
|
50
|
-
(?<inline_wo>
|
|
51
|
-
\b (?: w/o | W/O ) \b
|
|
52
|
-
)}x ## note - NOT case insensitive
|
|
53
|
-
|
|
54
|
-
INLINE_BYE_RE = %r{
|
|
55
|
-
(?<inline_bye>
|
|
56
|
-
\b (?: bye | BYE ) \b
|
|
57
|
-
)}x ## note - NOT case insensitive
|
|
58
26
|
|
|
59
27
|
|
|
60
28
|
###
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
(?<inline_np>
|
|
65
|
-
\b (?: n/p | N/P ) \b
|
|
66
|
-
)}x ## note - NOT case insensitive
|
|
29
|
+
## home/away/neutral - (h), (a), (n)
|
|
30
|
+
## add support for h/a/n
|
|
31
|
+
## with (?-i \b [han] \b) lower-case and \b boundary - why? why not?
|
|
67
32
|
|
|
33
|
+
TEAM_HOME_RE = %r{ (?<team_home> \(h\) )}ix
|
|
34
|
+
TEAM_AWAY_RE = %r{ (?<team_away> \(a\) )}ix
|
|
35
|
+
TEAM_NEUTRAL_RE = %r{ (?<team_neutral> \(n\) )}ix
|
|
68
36
|
|
|
69
|
-
###
|
|
70
|
-
# abd/abd. or aban/aban. [abandoned]
|
|
71
|
-
# ABD/ABAN
|
|
72
|
-
INLINE_ABD_RE = %r{
|
|
73
|
-
(?<inline_abd>
|
|
74
|
-
\b (?: abd\.? |
|
|
75
|
-
aban\.? |
|
|
76
|
-
ABD | ABAN
|
|
77
|
-
)
|
|
78
|
-
## POSITIVE lookahead - requires space
|
|
79
|
-
(?= [ ])
|
|
80
|
-
)}x ## note - NOT case insensitive
|
|
81
|
-
|
|
82
|
-
####
|
|
83
|
-
# susp/susp. [suspended]
|
|
84
|
-
# SUSP
|
|
85
|
-
INLINE_SUSP_RE = %r{
|
|
86
|
-
(?<inline_susp>
|
|
87
|
-
\b (?: susp\.? |
|
|
88
|
-
SUSP )
|
|
89
|
-
## POSITIVE lookahead - requires space
|
|
90
|
-
(?= [ ])
|
|
91
|
-
)}x ## note - NOT case insensitive
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
####
|
|
95
|
-
# ppd/ppd. or pst/pst. or pstp/pstp. or postp/postp. [postponed]
|
|
96
|
-
# PPD/PSTP/POSTP/P-P
|
|
97
|
-
# todo/check - add/allow p-p too - why? why not?
|
|
98
|
-
INLINE_PPD_RE = %r{
|
|
99
|
-
(?<inline_ppd>
|
|
100
|
-
\b (?: ppd\.? |
|
|
101
|
-
pst\.? |
|
|
102
|
-
po?stp\.? |
|
|
103
|
-
PPD | PST | PO?STP | P-P
|
|
104
|
-
)
|
|
105
|
-
## POSITIVE lookahead - requires space
|
|
106
|
-
(?= [ ])
|
|
107
|
-
)}x ## note - NOT case insensitive
|
|
108
|
-
|
|
109
|
-
####
|
|
110
|
-
# void via x-x X-X
|
|
111
|
-
# todo/check - only allow X-X - why? why not?
|
|
112
|
-
INLINE_VOID_RE = %r{
|
|
113
|
-
(?<inline_void>
|
|
114
|
-
\b (?: x-x |
|
|
115
|
-
X-X
|
|
116
|
-
)
|
|
117
|
-
## POSITIVE lookahead - requires space
|
|
118
|
-
(?= [ ])
|
|
119
|
-
)}x ## note - NOT case insensitive
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
####
|
|
123
|
-
# awd/awd. [awarded]
|
|
124
|
-
# AWD
|
|
125
|
-
# note - recommendation is to allways include score
|
|
126
|
-
# thus, use/prefer SCORE_AWD e.g. 0-3 awd
|
|
127
|
-
INLINE_AWD_RE = %r{
|
|
128
|
-
(?<inline_awd>
|
|
129
|
-
\b (?: awd\.? | AWD )
|
|
130
|
-
## POSITIVE lookahead - requires space
|
|
131
|
-
(?= [ ])
|
|
132
|
-
)}x ## note - NOT case insensitive
|
|
133
37
|
|
|
134
|
-
###
|
|
135
|
-
# canc/canc. [cancelled]
|
|
136
|
-
# CANC
|
|
137
|
-
INLINE_CANC_RE = %r{
|
|
138
|
-
(?<inline_canc>
|
|
139
|
-
\b (?: canc\.? | CANC )
|
|
140
|
-
## POSITIVE lookahead - requires space
|
|
141
|
-
(?= [ ])
|
|
142
|
-
)}x ## note - NOT case insensitive
|
|
143
38
|
|
|
144
39
|
|
|
145
|
-
###
|
|
146
|
-
## home/away/neutral - (h), (a), (n)
|
|
147
|
-
## add support for h/a/n
|
|
148
|
-
## with (?-i \b [han] \b) lower-case and \b boundry - why? why not?
|
|
149
40
|
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
41
|
+
##
|
|
42
|
+
## note VS
|
|
43
|
+
## remove VS for now
|
|
44
|
+
## e.g. Olympia Wijgmaal v VS Kortenaken
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
## note - only match case sensitive (downcased letters)!!!
|
|
49
|
+
## note - bigger match first e.g. vs than v etc.
|
|
50
|
+
VS_RE = %r{
|
|
51
|
+
(?<vs>
|
|
52
|
+
(?<=[ ]) # positive lookBEHIND for space
|
|
53
|
+
(?-i:
|
|
54
|
+
vs\.?|v
|
|
55
|
+
)
|
|
56
|
+
(?=[ ]) # positive lookAHEAD for space
|
|
57
|
+
)
|
|
58
|
+
}ix
|
|
59
|
+
|
|
153
60
|
|
|
154
61
|
|
|
155
62
|
|
|
63
|
+
##############
|
|
156
64
|
## "top-level" regex used for:
|
|
157
65
|
## - date_header
|
|
158
66
|
## - match_header & match_line_more
|
|
159
67
|
## - match_line
|
|
160
68
|
|
|
161
|
-
|
|
162
69
|
RE = Regexp.union(
|
|
70
|
+
SPACES_RE,
|
|
163
71
|
STATUS_RE, ## match status e.g. [cancelled], etc.
|
|
164
72
|
|
|
165
73
|
INLINE_WO_RE, ## (inline) match status - w/o (walkout)
|
|
@@ -167,11 +75,11 @@ RE = Regexp.union(
|
|
|
167
75
|
INLINE_BYE_RE, ## (inline) match status - bye (advance to next round)
|
|
168
76
|
INLINE_ABD_RE, ## (inline) match status - abd/abd. (abandoned)
|
|
169
77
|
INLINE_SUSP_RE, ## (inline) match status - susp/susp. (suspended)
|
|
170
|
-
INLINE_PPD_RE, ## (inline) match status - ppd/ppd. or pstp/pstp. or postp/postp. or p-p (postponed)
|
|
171
|
-
INLINE_VOID_RE, ## (inline) match status - x-x (voided)
|
|
78
|
+
INLINE_PPD_RE, ## (inline) match status - ppd/ppd. or pstp/pstp. or postp/postp. or p-p (postponed)
|
|
79
|
+
INLINE_VOID_RE, ## (inline) match status - x-x (voided)
|
|
172
80
|
INLINE_AWD_RE, ## (inline) match status - awd/awd. (awarded)
|
|
173
81
|
INLINE_CANC_RE, ## (inline) match status - canc/canc. (cancelled/canceled)
|
|
174
|
-
|
|
82
|
+
|
|
175
83
|
|
|
176
84
|
TEAM_HOME_RE, ## (H)
|
|
177
85
|
TEAM_AWAY_RE, ## (A)
|
|
@@ -180,26 +88,25 @@ RE = Regexp.union(
|
|
|
180
88
|
NOTE_RE, ### fix - change to INLINE_NOTE !!!
|
|
181
89
|
DATE_LEGS_RE, # note - must go before date!!!
|
|
182
90
|
DATE_RE, ## note - date must go before time (e.g. 12.12. vs 12.12)
|
|
183
|
-
|
|
184
|
-
|
|
91
|
+
TIME_RE,
|
|
92
|
+
|
|
93
|
+
ATTENDANCE_RE, # note - allow att: for now inline in matches too - why? why not?
|
|
94
|
+
|
|
95
|
+
SCORE_FULL_1ST_RE, # note - MUST go before SCORE_LEGS_RE!!
|
|
96
|
+
## e.g. 2-2, 5-1 pen.
|
|
185
97
|
SCORE_LEGS_RE,
|
|
186
|
-
SCORE_FULL_RE,
|
|
98
|
+
SCORE_FULL_RE,
|
|
187
99
|
SCORE_FULLER_RE,
|
|
188
100
|
SCORE_FULLER_MORE_RE,
|
|
189
101
|
SCORE_AWD_RE, # (inline) score awarded e.g. 3-0 awd or 0-1 awd. etc.
|
|
190
102
|
SCORE_ABD_RE, # (inline) score abandoned e.g. 2-1 abd.
|
|
191
103
|
SCORE_RE, ## note basic score e.g. 1-1 must go after SCORE_FULL_RE!!!
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
SCORE_TEAM_PEN_RE, ## e.g. 1 (2)
|
|
196
|
-
|
|
197
|
-
BASICS_RE,
|
|
104
|
+
|
|
105
|
+
VS_RE,
|
|
106
|
+
|
|
198
107
|
TEXT_RE,
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
## only match if nothing else matches (expect ANY)
|
|
202
|
-
SCORE_TEAM_NUM_RE, ## e.g. 0 or 1 or 9 or 11 etc. (<100)
|
|
108
|
+
|
|
109
|
+
%r{ (?<sym> [,@()-] ) }x, ## todo - check if "standalone" comma (,) in use?
|
|
203
110
|
ANY_RE,
|
|
204
111
|
)
|
|
205
112
|
|
|
@@ -210,25 +117,25 @@ RE = Regexp.union(
|
|
|
210
117
|
|
|
211
118
|
## ord (for ordinal number)
|
|
212
119
|
## e.g. (51) or (1) etc. - limit digits of number - why? why not???
|
|
213
|
-
|
|
120
|
+
|
|
214
121
|
START_WITH_ORD = %r{
|
|
215
|
-
\A
|
|
122
|
+
\A
|
|
216
123
|
[ ]* ## ignore leading spaces (if any)
|
|
217
124
|
(?<ord>
|
|
218
|
-
\(
|
|
219
|
-
(?<value>\d+)
|
|
125
|
+
\(
|
|
126
|
+
(?<value>\d+)
|
|
220
127
|
\)
|
|
221
128
|
)}ix
|
|
222
129
|
|
|
223
130
|
|
|
224
|
-
###
|
|
131
|
+
###
|
|
225
132
|
## e.g. 1930, 1986, 2002, 2010, 2022, 2026
|
|
226
133
|
## note - only YYYY
|
|
227
134
|
## note - look out for clubs like 1860 München (de) !!!
|
|
228
135
|
## 1899 Hoffenheim (de)
|
|
229
136
|
## 1896 Löwenherz (ch - a.k.a. FC Winterthur ??)
|
|
230
137
|
## any others starting with YYYY ?!
|
|
231
|
-
## note - YEAR requires TWO (trailing) spaces !!!!! e.g.
|
|
138
|
+
## note - YEAR requires TWO (trailing) spaces !!!!! e.g.
|
|
232
139
|
## 1930 Uruguay 4-2 Argentina
|
|
233
140
|
## 1934 Italy 2-1 Czechoslovakia (AET)
|
|
234
141
|
## 2022 Argentina 3-3 France (AET, 4-2 pen)
|
|
@@ -243,17 +150,17 @@ START_WITH_YEAR = %r{
|
|
|
243
150
|
(?<year>
|
|
244
151
|
\d{4}
|
|
245
152
|
)
|
|
246
|
-
## positive lookahead
|
|
247
|
-
(?= [ ]{2} | ## min. TWO spaces or
|
|
153
|
+
## positive lookahead
|
|
154
|
+
(?= [ ]{2} | ## min. TWO spaces!!! or
|
|
248
155
|
[ ]@ | ## space with geo marker or
|
|
249
156
|
[ ]* \z ## year (date) header (end-of-line/string)
|
|
250
|
-
)
|
|
157
|
+
)
|
|
251
158
|
}x
|
|
252
159
|
|
|
253
160
|
|
|
254
161
|
|
|
255
162
|
###
|
|
256
|
-
## check for headings
|
|
163
|
+
## check for headings
|
|
257
164
|
## e.g. = heading 1
|
|
258
165
|
## == heading 2 etc.
|
|
259
166
|
## =Eurochampionship=
|
|
@@ -265,28 +172,20 @@ START_WITH_YEAR = %r{
|
|
|
265
172
|
|
|
266
173
|
HEADING_RE = %r{ \A
|
|
267
174
|
[ ]* ## ignore leading spaces (if any)
|
|
268
|
-
(?<heading_marker> ={1,6} )
|
|
175
|
+
(?<heading_marker> ={1,6} )
|
|
269
176
|
[ ]*
|
|
270
177
|
(?<heading>
|
|
271
178
|
## must start with letter - why? why not?
|
|
272
179
|
### 1st round
|
|
273
|
-
## allow numbers e.g. Group A - 1
|
|
274
|
-
[^=]+? ## use non-greedy
|
|
180
|
+
## allow numbers e.g. Group A - 1
|
|
181
|
+
[^=]+? ## use non-greedy
|
|
275
182
|
)
|
|
276
183
|
[ ]* ## ignore trailing spaces (if any)
|
|
277
|
-
(?: =*
|
|
184
|
+
(?: =*) ## allow any trailing heading markers
|
|
278
185
|
[ ]* ## ignore trailing spaces (if any)
|
|
279
186
|
\z
|
|
280
187
|
}ix
|
|
281
188
|
|
|
282
189
|
|
|
283
|
-
HRULER_RE = %r{
|
|
284
|
-
\A
|
|
285
|
-
[ ]* ## ignore leading spaces (if any)
|
|
286
|
-
-{3,} ## must be at least three dashes!!!
|
|
287
|
-
[ ]* ## ignore trailing spaces (if any)
|
|
288
|
-
\z
|
|
289
|
-
}ix
|
|
290
|
-
|
|
291
190
|
end # class Lexer
|
|
292
191
|
end # module SportDb
|