sportdb-parser 0.5.8 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Manifest.txt +2 -0
- data/config/rounds_en.txt +5 -0
- data/lib/sportdb/parser/lexer.rb +47 -28
- data/lib/sportdb/parser/parser.rb +421 -344
- data/lib/sportdb/parser/racc_parser.rb +1 -1
- data/lib/sportdb/parser/racc_tree.rb +12 -5
- data/lib/sportdb/parser/token-date.rb +18 -1
- data/lib/sportdb/parser/token-minute.rb +45 -0
- data/lib/sportdb/parser/token-prop.rb +133 -0
- data/lib/sportdb/parser/token-text.rb +14 -5
- data/lib/sportdb/parser/token.rb +49 -183
- data/lib/sportdb/parser/version.rb +2 -2
- data/lib/sportdb/parser.rb +2 -0
- metadata +4 -2
@@ -14,7 +14,7 @@ def initialize( txt, debug: false )
|
|
14
14
|
|
15
15
|
### todo:
|
16
16
|
## - pass along debug flag
|
17
|
-
lexer = SportDb::Lexer.new( txt )
|
17
|
+
lexer = SportDb::Lexer.new( txt, debug: debug )
|
18
18
|
## note - use tokenize_with_errors and add/collect tokenize errors
|
19
19
|
@tokens, @errors = lexer.tokenize_with_errors
|
20
20
|
## pp @tokens
|
@@ -71,10 +71,14 @@ RoundDef = Struct.new( :name, :date, :duration ) do
|
|
71
71
|
end
|
72
72
|
end
|
73
73
|
|
74
|
-
DateHeader = Struct.new( :date ) do
|
74
|
+
DateHeader = Struct.new( :date, :time, :geo, :timezone ) do
|
75
75
|
def pretty_print( printer )
|
76
76
|
printer.text( "<DateHeader " )
|
77
|
-
printer.text( "#{self.date.pretty_inspect}
|
77
|
+
printer.text( "#{self.date.pretty_inspect}" )
|
78
|
+
printer.text( " time=#{self.time.pretty_inspect}" ) if self.time
|
79
|
+
printer.text( " geo=#{self.geo.pretty_inspect}" ) if self.geo
|
80
|
+
printer.text( " timezone=#{self.timezone}") if self.timezone
|
81
|
+
printer.text( ">")
|
78
82
|
end
|
79
83
|
end
|
80
84
|
|
@@ -85,14 +89,17 @@ GroupHeader = Struct.new( :name ) do
|
|
85
89
|
end
|
86
90
|
end
|
87
91
|
|
88
|
-
RoundHeader = Struct.new( :names ) do
|
92
|
+
RoundHeader = Struct.new( :names, :group ) do
|
89
93
|
def pretty_print( printer )
|
90
94
|
printer.text( "<RoundHeader " )
|
91
|
-
printer.text( "#{self.names.join(', ')}
|
95
|
+
printer.text( "#{self.names.join(', ')}" )
|
96
|
+
printer.text( " group=#{self.group}") if self.group
|
97
|
+
printer.text( ">" )
|
92
98
|
end
|
93
99
|
end
|
94
100
|
|
95
|
-
|
101
|
+
|
102
|
+
MatchLine = Struct.new( :ord, :date, :time, :wday,
|
96
103
|
:team1, :team2, :score,
|
97
104
|
:status,
|
98
105
|
:geo,
|
@@ -146,12 +146,29 @@ DATE_II_RE = %r{
|
|
146
146
|
)}ix
|
147
147
|
|
148
148
|
|
149
|
+
# e.g. iso-date - 2011-08-25
|
150
|
+
## todo/check - allow 2011-8-25 or 2011-8-3 / 2011-08-03 etc. - why? why not?
|
151
|
+
DATE_III_RE = %r{
|
152
|
+
(?<date>
|
153
|
+
\b
|
154
|
+
(?<year>\d{4})
|
155
|
+
-
|
156
|
+
(?<month>\d{2})
|
157
|
+
-
|
158
|
+
(?<day>\d{2})
|
159
|
+
\b
|
160
|
+
)}ix
|
161
|
+
|
162
|
+
|
163
|
+
|
164
|
+
|
149
165
|
#############################################
|
150
166
|
# map tables
|
151
167
|
# note: order matters; first come-first matched/served
|
152
168
|
DATE_RE = Regexp.union(
|
153
169
|
DATE_I_RE,
|
154
|
-
DATE_II_RE
|
170
|
+
DATE_II_RE,
|
171
|
+
DATE_III_RE,
|
155
172
|
)
|
156
173
|
|
157
174
|
|
@@ -0,0 +1,45 @@
|
|
1
|
+
|
2
|
+
module SportDb
|
3
|
+
class Lexer
|
4
|
+
|
5
|
+
#
|
6
|
+
# todo/check - move goal type regexes to goal or somewhere else?
|
7
|
+
#
|
8
|
+
|
9
|
+
## goal types
|
10
|
+
# (pen.) or (pen) or (p.) or (p)
|
11
|
+
## (o.g.) or (og)
|
12
|
+
## todo/check - keep case-insensitive
|
13
|
+
## or allow OG or P or PEN or
|
14
|
+
## only lower case - why? why not?
|
15
|
+
GOAL_PEN_RE = %r{
|
16
|
+
(?<pen> \(
|
17
|
+
(?:pen|p)\.?
|
18
|
+
\)
|
19
|
+
)
|
20
|
+
}ix
|
21
|
+
GOAL_OG_RE = %r{
|
22
|
+
(?<og> \(
|
23
|
+
(?:og|o\.g\.)
|
24
|
+
\)
|
25
|
+
)
|
26
|
+
}ix
|
27
|
+
|
28
|
+
|
29
|
+
MINUTE_RE = %r{
|
30
|
+
(?<minute>
|
31
|
+
(?<=[ (]) # positive lookbehind for space or opening ( e.g. (61') required
|
32
|
+
# todo - add more lookbehinds e.g. ,) etc. - why? why not?
|
33
|
+
(?<value>\d{1,3}) ## constrain numbers to 0 to 999!!!
|
34
|
+
(?: \+
|
35
|
+
(?<value2>\d{1,3})
|
36
|
+
)?
|
37
|
+
' ## must have minute marker!!!!
|
38
|
+
)
|
39
|
+
}ix
|
40
|
+
|
41
|
+
|
42
|
+
|
43
|
+
|
44
|
+
end # class Lexer
|
45
|
+
end # module SportDb
|
@@ -0,0 +1,133 @@
|
|
1
|
+
###
|
2
|
+
## team prop mode e.g.
|
3
|
+
##
|
4
|
+
##
|
5
|
+
## Fri Jun 14 21:00 @ München Fußball Arena, München
|
6
|
+
## (1) Germany v Scotland 5-1 (3-0)
|
7
|
+
## Wirtz 10' Musiala 19' Havertz 45+1' (pen.) Füllkrug 68' Can 90+3'; Rüdiger 87' (o.g.)
|
8
|
+
##
|
9
|
+
## Germany: Neuer - Kimmich, Rüdiger, Tah [Y], Mittelstädt - Andrich [Y] (Groß 46'),
|
10
|
+
## Kroos (Can 80') - Musiala (Müller 74'), Gündogan, Wirtz (Sane 63') -
|
11
|
+
## Havertz (Füllkrug 63')
|
12
|
+
## Scotland: Gunn - Porteous [R 44'], Hendry, Tierney (McKenna 78') - Ralston [Y],
|
13
|
+
## McTominay, McGregor (Gilmour 67'), Robertson - Christie (Shankland 82'),
|
14
|
+
## Adams (Hanley 46'), McGinn (McLean 67')
|
15
|
+
|
16
|
+
|
17
|
+
module SportDb
|
18
|
+
class Lexer
|
19
|
+
|
20
|
+
|
21
|
+
## name different from text (does NOT allow number in name/text)
|
22
|
+
|
23
|
+
PROP_NAME_RE = %r{
|
24
|
+
(?<prop_name> \b
|
25
|
+
(?<name>
|
26
|
+
\p{L}+
|
27
|
+
\.? ## optional dot
|
28
|
+
(?:
|
29
|
+
[ ]? # only single spaces allowed inline!!!
|
30
|
+
(?:
|
31
|
+
(?:
|
32
|
+
(?<=\p{L}) ## use lookbehind
|
33
|
+
[/'-] ## must be surrounded by letters
|
34
|
+
## e.g. One/Two NOT
|
35
|
+
## One/ Two or One / Two or One /Two etc.
|
36
|
+
(?=\p{L}) ## use lookahead
|
37
|
+
)
|
38
|
+
|
|
39
|
+
(?:
|
40
|
+
(?<=[ ]) ## use lookbehind -- add letter (plus dot) or such - why? why not?
|
41
|
+
['] ## must be surrounded by leading space and
|
42
|
+
## trailing letters (e.g. UDI 'Beter Bed)
|
43
|
+
(?=\p{L}) ## use lookahead
|
44
|
+
)
|
45
|
+
|
|
46
|
+
(?:
|
47
|
+
(?<=\p{L}) ## use lookbehind
|
48
|
+
['] ## must be surrounded by leading letter and
|
49
|
+
## trailing space PLUS letter (e.g. UDI' Beter Bed)
|
50
|
+
(?=[ ]\p{L}) ## use lookahead (space WITH letter
|
51
|
+
)
|
52
|
+
| ## standard case with letter(s) and optional dot
|
53
|
+
(?: \p{L}+
|
54
|
+
\.? ## optional dot
|
55
|
+
)
|
56
|
+
)+
|
57
|
+
)*
|
58
|
+
)
|
59
|
+
## add lookahead - must be non-alphanum
|
60
|
+
(?=[ ,;\]\)]|$)
|
61
|
+
)
|
62
|
+
}ix
|
63
|
+
|
64
|
+
|
65
|
+
|
66
|
+
|
67
|
+
##############
|
68
|
+
# add support for props/ attributes e.g.
|
69
|
+
#
|
70
|
+
# Germany: Neuer - Kimmich, Rüdiger, Tah [Y], Mittelstädt - Andrich [Y] (46' Groß),
|
71
|
+
# Kroos (80' Can) - Musiala (74' Müller), Gündogan,
|
72
|
+
# Wirtz (63' Sane) - Havertz (63' Füllkrug)
|
73
|
+
# Scotland: Gunn - Porteous [R 44'], Hendry, Tierney (78' McKenna) - Ralston [Y],
|
74
|
+
# McTominay, McGregor (67' Gilmour), Robertson - Christie (82' Shankland),
|
75
|
+
# Adams (46' Hanley), McGinn (67' McLean)
|
76
|
+
#
|
77
|
+
## note: colon (:) MUST be followed by one (or more) spaces
|
78
|
+
## make sure mon feb 12 18:10 will not match
|
79
|
+
## allow 1. FC Köln etc.
|
80
|
+
## Mainz 05:
|
81
|
+
## limit to 30 chars max
|
82
|
+
## only allow chars incl. intl but (NOT ()[]/;)
|
83
|
+
##
|
84
|
+
## todo/fix:
|
85
|
+
## check if St. Pölten works; with starting St. ???
|
86
|
+
|
87
|
+
|
88
|
+
PROP_KEY_RE = %r{
|
89
|
+
(?<prop_key> \b
|
90
|
+
(?<key>
|
91
|
+
(?:\p{L}+
|
92
|
+
|
|
93
|
+
\d+ # check for num lookahead (MUST be space or dot)
|
94
|
+
## MUST be followed by (optional dot) and
|
95
|
+
## required space !!!
|
96
|
+
## MUST be followed by a to z!!!!
|
97
|
+
\.? ## optional dot
|
98
|
+
[ ]? ## make space optional too - why? why not?
|
99
|
+
## yes - eg. 1st, 2nd, 5th etc.
|
100
|
+
\p{L}+
|
101
|
+
)
|
102
|
+
[\d\p{L}'/° -]*? ## allow almost anything
|
103
|
+
## fix - add negative lookahead
|
104
|
+
## no space and dash etc.
|
105
|
+
## only allowed "inline" not at the end
|
106
|
+
## must end with letter or digit!
|
107
|
+
)
|
108
|
+
[ ]*? # slurp trailing spaces
|
109
|
+
:
|
110
|
+
(?=[ ]+) ## positive lookahead (must be followed by space!!)
|
111
|
+
)
|
112
|
+
}ix
|
113
|
+
|
114
|
+
|
115
|
+
|
116
|
+
PROP_BASICS_RE = %r{
|
117
|
+
(?<spaces> [ ]{2,}) |
|
118
|
+
(?<space> [ ])
|
119
|
+
|
|
120
|
+
(?<sym>
|
121
|
+
[;,\(\)\[\]-]
|
122
|
+
)
|
123
|
+
}ix
|
124
|
+
|
125
|
+
PROP_RE = Regexp.union(
|
126
|
+
PROP_BASICS_RE,
|
127
|
+
MINUTE_RE,
|
128
|
+
PROP_NAME_RE,
|
129
|
+
)
|
130
|
+
|
131
|
+
|
132
|
+
end # class Lexer
|
133
|
+
end # module SportDb
|
@@ -24,6 +24,13 @@ class Lexer
|
|
24
24
|
# allow Cote'd Ivoir or such
|
25
25
|
## e.g. add '
|
26
26
|
|
27
|
+
## note:
|
28
|
+
## make sure these do NOT match!!!
|
29
|
+
## TEXT => "Matchday 1 / Group A"
|
30
|
+
## TEXT => "Matchday 2 / Group A"
|
31
|
+
## TEXT => "Matchday 3 / Group A"
|
32
|
+
|
33
|
+
|
27
34
|
|
28
35
|
TEXT_RE = %r{
|
29
36
|
## must start with alpha (allow unicode letters!!)
|
@@ -53,15 +60,17 @@ TEXT_RE = %r{
|
|
53
60
|
\d+\.-\d+\. [ ]? \p{L}+
|
54
61
|
)
|
55
62
|
|
56
|
-
(?:(?: (?:[ ]
|
57
|
-
(?!
|
58
|
-
|
63
|
+
(?:(?: (?:[ ]
|
64
|
+
(?! (?-i: vs?[ ])
|
65
|
+
) ## note - exclude (v[ ]/vs[ ])
|
66
|
+
## AND switch to case-sensitive (via -i!!!)
|
67
|
+
)
|
59
68
|
| # only single spaces allowed inline!!!
|
60
|
-
[
|
69
|
+
[-/]
|
61
70
|
)?
|
62
71
|
(?:
|
63
72
|
\p{L} |
|
64
|
-
[
|
73
|
+
[&'°]
|
65
74
|
|
|
66
75
|
(?:
|
67
76
|
\d+
|
data/lib/sportdb/parser/token.rb
CHANGED
@@ -7,13 +7,14 @@ class Lexer
|
|
7
7
|
##
|
8
8
|
# keep 18h30 - why? why not?
|
9
9
|
# add support for 6:30pm 8:20am etc. - why? why not?
|
10
|
-
|
10
|
+
#
|
11
|
+
# check - only support h e.g. 18h30 or 18H30 too - why? why not?
|
12
|
+
# e.g. 18.30 (or 18:30 or 18h30)
|
11
13
|
TIME_RE = %r{
|
12
|
-
## e.g. 18.30 (or 18:30 or 18h30)
|
13
14
|
(?<time> \b
|
14
|
-
|
15
|
+
(?: (?<hour>\d{1,2})
|
15
16
|
(?: :|\.|h )
|
16
|
-
(?<minute>\d{2})
|
17
|
+
(?<minute>\d{2}))
|
17
18
|
\b
|
18
19
|
)
|
19
20
|
}ix
|
@@ -42,9 +43,12 @@ TIME_RE = %r{
|
|
42
43
|
# https://en.wikipedia.org/wiki/Time_zone
|
43
44
|
# https://en.wikipedia.org/wiki/List_of_UTC_offsets
|
44
45
|
# https://en.wikipedia.org/wiki/UTC−04:00 etc.
|
45
|
-
|
46
|
+
#
|
47
|
+
# e.g. (UTC-2) or (CEST/UTC-2) etc.
|
48
|
+
# todo check - only allow upcase
|
49
|
+
# or (utc-2) and (cest/utc-2) too - why? why not?
|
50
|
+
|
46
51
|
TIMEZONE_RE = %r{
|
47
|
-
## e.g. (UTC-2) or (CEST/UTC-2) etc.
|
48
52
|
(?<timezone>
|
49
53
|
\(
|
50
54
|
## optional "local" timezone name eg. BRT or CEST etc.
|
@@ -60,207 +64,69 @@ TIMEZONE_RE = %r{
|
|
60
64
|
|
61
65
|
|
62
66
|
|
67
|
+
## add wday / stand-alone week day - as separate regex or
|
68
|
+
## use TEXT with is_wday? check or such with
|
69
|
+
## requirement of beginning of line (anchored to line) only??
|
70
|
+
## - why? why not?
|
71
|
+
|
72
|
+
WDAY_RE = %r{
|
73
|
+
(?<wday>
|
74
|
+
\b # note - alternation (|) is lowest precedence (such
|
75
|
+
# parentheses required around \b()\b !!!
|
76
|
+
## note - case sensitive (NOT case-insensitive)!!!
|
77
|
+
(?<day_name>
|
78
|
+
(?-i:
|
79
|
+
Mon|Mo|
|
80
|
+
Tue|Tu|
|
81
|
+
Wed|We|
|
82
|
+
Thu|Th|
|
83
|
+
Fri|Fr|
|
84
|
+
Sat|Sa|
|
85
|
+
Sun|Su
|
86
|
+
))
|
87
|
+
\b ## todo/check - must be followed by two spaces or space + [( etc.
|
88
|
+
## to allow words starting with weekday abbreviations - why? why not?
|
89
|
+
## check if any names (teams, rounds, etc) come up in practice
|
90
|
+
## or maybe remove three letter abbreviations Mon/Tue
|
91
|
+
## and keep only Mo/Tu/We etc. - why? why not?
|
92
|
+
)}x
|
93
|
+
|
94
|
+
|
95
|
+
|
63
96
|
|
64
97
|
BASICS_RE = %r{
|
65
98
|
## e.g. (51) or (1) etc. - limit digits of number???
|
99
|
+
## todo/fix - change num to ord (for ordinal number)!!!!!
|
66
100
|
(?<num> \( (?<value>\d+) \) )
|
67
101
|
|
|
68
102
|
(?<vs>
|
69
|
-
(?<=[ ]) #
|
70
|
-
(
|
71
|
-
|
72
|
-
)
|
73
|
-
|
74
|
-
# todo/fix - make vs|v case sensitive!!! only match v/vs - why? why not?
|
103
|
+
(?<=[ ]) # positive lookbehind for space
|
104
|
+
(?-i:
|
105
|
+
vs|v
|
106
|
+
) # note - only match case sensitive (downcased letters)!!!
|
107
|
+
# note - bigger match first e.g. vs than v etc.
|
75
108
|
(?=[ ]) # positive lookahead for space
|
76
109
|
)
|
77
110
|
|
|
78
111
|
(?<spaces> [ ]{2,}) |
|
79
112
|
(?<space> [ ])
|
80
113
|
|
|
81
|
-
(?<sym>[
|
82
|
-
}ix
|
83
|
-
|
84
|
-
|
85
|
-
## removed from basics
|
86
|
-
=begin
|
87
|
-
(?<none>
|
88
|
-
(?<=[ \[]|^) # Positive lookbehind for space or [
|
89
|
-
-
|
90
|
-
(?=[ ]*;) # positive lookahead for space
|
91
|
-
)
|
92
|
-
|
|
93
|
-
(?<vs>
|
94
|
-
(?<=[ ]) # Positive lookbehind for space
|
95
|
-
(?:
|
96
|
-
vs\.?| ## allow optional dot (eg. vs. v.)
|
97
|
-
v\.?|
|
98
|
-
-
|
99
|
-
) # not bigger match first e.g. vs than v etc.
|
100
|
-
(?=[ ]) # positive lookahead for space
|
101
|
-
)
|
102
|
-
|
|
103
|
-
|
104
|
-
make - into a simple symbol !!!
|
105
|
-
=end
|
106
|
-
|
107
|
-
|
108
|
-
MINUTE_RE = %r{
|
109
|
-
(?<minute>
|
110
|
-
(?<=[ (]) # Positive lookbehind for space or opening ( e.g. (61') required
|
111
|
-
(?<value>\d{1,3}) ## constrain numbers to 0 to 999!!!
|
112
|
-
(?: \+
|
113
|
-
(?<value2>\d{1,3})
|
114
|
-
)?
|
115
|
-
' ## must have minute marker!!!!
|
116
|
-
)
|
117
|
-
}ix
|
118
|
-
|
119
|
-
|
120
|
-
## goal types
|
121
|
-
# (pen.) or (pen) or (p.) or (p)
|
122
|
-
## (o.g.) or (og)
|
123
|
-
GOAL_PEN_RE = %r{
|
124
|
-
(?<pen> \(
|
125
|
-
(?:pen|p)\.?
|
126
|
-
\)
|
127
|
-
)
|
128
|
-
}ix
|
129
|
-
GOAL_OG_RE = %r{
|
130
|
-
(?<og> \(
|
131
|
-
(?:og|o\.g\.)
|
132
|
-
\)
|
133
|
-
)
|
114
|
+
(?<sym>[;,/@|\[\]-])
|
134
115
|
}ix
|
135
116
|
|
136
117
|
|
137
118
|
|
138
119
|
|
139
|
-
|
140
|
-
|
141
|
-
PROP_BASICS_RE = %r{
|
142
|
-
(?<spaces> [ ]{2,}) |
|
143
|
-
(?<space> [ ])
|
144
|
-
|
|
145
|
-
(?<sym>[.;,\(\)\[\]-]) ## note - dot (.) is the (all-important) end-of-prop marker!!!
|
146
|
-
}ix
|
147
|
-
|
148
|
-
|
149
|
-
## name different from text (does not allow number in name/text)
|
150
|
-
##
|
151
|
-
## note - includes special handling for dot (.) if at the end of line!!!
|
152
|
-
## end-of-line dot (.) is the prop end-of-marker - do NOT eat-up!!!
|
153
|
-
|
154
|
-
PROP_NAME_RE = %r{
|
155
|
-
(?<prop_name> \b
|
156
|
-
(?<name>
|
157
|
-
\p{L}+
|
158
|
-
(?: \. (?: (?![ ]*$) )
|
159
|
-
)? ## edge case - check for end of prop marker! (e.g. Stop.)
|
160
|
-
(?:
|
161
|
-
[ ]? # only single spaces allowed inline!!!
|
162
|
-
(?:
|
163
|
-
(?:
|
164
|
-
(?<=\p{L}) ## use lookbehind
|
165
|
-
[/'-] ## must be surrounded by letters
|
166
|
-
## e.g. One/Two NOT
|
167
|
-
## One/ Two or One / Two or One /Two etc.
|
168
|
-
(?=\p{L}) ## use lookahead
|
169
|
-
)
|
170
|
-
|
|
171
|
-
(?:
|
172
|
-
(?<=[ ]) ## use lookbehind -- add letter (plus dot) or such - why? why not?
|
173
|
-
['] ## must be surrounded by leading space and
|
174
|
-
## traling letters (e.g. UDI 'Beter Bed)
|
175
|
-
(?=\p{L}) ## use lookahead
|
176
|
-
)
|
177
|
-
|
|
178
|
-
(?:
|
179
|
-
(?<=\p{L}) ## use lookbehind
|
180
|
-
['] ## must be surrounded by leading letter and
|
181
|
-
## trailing space PLUS letter (e.g. UDI' Beter Bed)
|
182
|
-
(?=[ ]\p{L}) ## use lookahead (space WITH letter
|
183
|
-
)
|
184
|
-
|
|
185
|
-
(?: \p{L}+
|
186
|
-
(?: \.
|
187
|
-
(?: (?![ ]*$) )
|
188
|
-
)? ## last dot is delimiter!!!
|
189
|
-
)
|
190
|
-
)+
|
191
|
-
)*
|
192
|
-
)
|
193
|
-
## add lookahead - must be non-alphanum (or dot)
|
194
|
-
(?=[ .,;\]\)]|$)
|
195
|
-
)
|
196
|
-
}ix
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
##############
|
202
|
-
# add support for props/ attributes e.g.
|
203
|
-
#
|
204
|
-
# Germany: Neuer - Kimmich, Rüdiger, Tah [Y], Mittelstädt – Andrich [Y] (46' Groß),
|
205
|
-
# Kroos (80' Can) – Musiala (74' Müller), Gündogan,
|
206
|
-
# Wirtz (63' Sane) – Havertz (63' Füllkrug).
|
207
|
-
# Scotland: Gunn – Porteous [R 44'], Hendry, Tierney (78' McKenna) – Ralston [Y],
|
208
|
-
# McTominay, McGregor (67' Gilmour), Robertson – Christie (82' Shankland),
|
209
|
-
# Adams (46' Hanley), McGinn (67' McLean).
|
210
|
-
#
|
211
|
-
## note: colon (:) MUST be followed by one (or more) spaces
|
212
|
-
## make sure mon feb 12 18:10 will not match
|
213
|
-
## allow 1. FC Köln etc.
|
214
|
-
## Mainz 05:
|
215
|
-
## limit to 30 chars max
|
216
|
-
## only allow chars incl. intl but (NOT ()[]/;)
|
217
|
-
|
218
|
-
|
219
|
-
PROP_KEY_RE = %r{
|
220
|
-
(?<prop_key> \b
|
221
|
-
(?<key>
|
222
|
-
(?:\p{L}+
|
223
|
-
|
|
224
|
-
\d+ # check for num lookahead (MUST be space or dot)
|
225
|
-
## MUST be followed by (optional dot) and
|
226
|
-
## required space !!!
|
227
|
-
## MUST be follow by a to z!!!!
|
228
|
-
\.? ## optional dot
|
229
|
-
[ ]? ## make space optional too - why? why not?
|
230
|
-
## yes - eg. 1st, 2nd, 5th etc.
|
231
|
-
\p{L}+
|
232
|
-
)
|
233
|
-
[\d\p{L}'/° -]*? ## allow almost anyting
|
234
|
-
## fix - add negative lookahead
|
235
|
-
## no space and dash etc.
|
236
|
-
## only allowed "inline" not at the end
|
237
|
-
## must end with latter or digit!
|
238
|
-
)
|
239
|
-
[ ]*? # slurp trailing spaces
|
240
|
-
:
|
241
|
-
(?=[ ]+) ## possitive lookahead (must be followed by space!!)
|
242
|
-
)
|
243
|
-
}ix
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
PROP_RE = Regexp.union(
|
249
|
-
PROP_BASICS_RE,
|
250
|
-
MINUTE_RE,
|
251
|
-
PROP_NAME_RE,
|
252
|
-
)
|
253
|
-
|
254
|
-
|
255
|
-
|
256
120
|
RE = Regexp.union( PROP_KEY_RE, ## start with prop key (match will/should switch into prop mode!!!)
|
257
121
|
STATUS_RE,
|
258
122
|
TIMEZONE_RE,
|
259
123
|
TIME_RE,
|
260
124
|
DURATION_RE, # note - duration MUST match before date
|
261
125
|
DATE_RE,
|
126
|
+
WDAY_RE, # allow standalone weekday name (e.g. Mo/Tu/etc.) - why? why not?
|
262
127
|
SCORE_RE,
|
263
|
-
BASICS_RE,
|
128
|
+
BASICS_RE,
|
129
|
+
MINUTE_RE,
|
264
130
|
GOAL_OG_RE, GOAL_PEN_RE,
|
265
131
|
TEXT_RE )
|
266
132
|
|
data/lib/sportdb/parser.rb
CHANGED
@@ -21,6 +21,8 @@ require_relative 'parser/token-score'
|
|
21
21
|
require_relative 'parser/token-date'
|
22
22
|
require_relative 'parser/token-text'
|
23
23
|
require_relative 'parser/token-status'
|
24
|
+
require_relative 'parser/token-minute'
|
25
|
+
require_relative 'parser/token-prop' ## team prop(erty) mode (note - must be before token)
|
24
26
|
require_relative 'parser/token'
|
25
27
|
require_relative 'parser/lexer'
|
26
28
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sportdb-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-01-
|
11
|
+
date: 2025-01-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: cocos
|
@@ -102,6 +102,8 @@ files:
|
|
102
102
|
- lib/sportdb/parser/racc_parser.rb
|
103
103
|
- lib/sportdb/parser/racc_tree.rb
|
104
104
|
- lib/sportdb/parser/token-date.rb
|
105
|
+
- lib/sportdb/parser/token-minute.rb
|
106
|
+
- lib/sportdb/parser/token-prop.rb
|
105
107
|
- lib/sportdb/parser/token-score.rb
|
106
108
|
- lib/sportdb/parser/token-status.rb
|
107
109
|
- lib/sportdb/parser/token-text.rb
|