sportdb-parser 0.6.20 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Manifest.txt +14 -8
- data/Rakefile +1 -1
- data/lib/sportdb/parser/blocktxt.rb +99 -0
- data/lib/sportdb/parser/lexer.rb +958 -395
- data/lib/sportdb/parser/lexer_buffer.rb +97 -0
- data/lib/sportdb/parser/lexer_tty.rb +111 -0
- data/lib/sportdb/parser/parser.rb +1768 -855
- data/lib/sportdb/parser/racc_parser.rb +1 -1
- data/lib/sportdb/parser/racc_tree.rb +327 -41
- data/lib/sportdb/parser/token-date.rb +160 -178
- data/lib/sportdb/parser/token-date_duration.rb +190 -0
- data/lib/sportdb/parser/token-geo.rb +59 -59
- data/lib/sportdb/parser/token-goals.rb +460 -0
- data/lib/sportdb/parser/token-group.rb +43 -0
- data/lib/sportdb/parser/token-note.rb +40 -0
- data/lib/sportdb/parser/token-prop.rb +70 -54
- data/lib/sportdb/parser/token-prop_name.rb +74 -0
- data/lib/sportdb/parser/token-round.rb +102 -0
- data/lib/sportdb/parser/token-score.rb +323 -47
- data/lib/sportdb/parser/token-score_fuller.rb +435 -0
- data/lib/sportdb/parser/token-score_legs.rb +59 -0
- data/lib/sportdb/parser/token-status.rb +157 -160
- data/lib/sportdb/parser/token-table.rb +149 -0
- data/lib/sportdb/parser/token-text.rb +72 -23
- data/lib/sportdb/parser/token-time.rb +141 -0
- data/lib/sportdb/parser/token.rb +242 -105
- data/lib/sportdb/parser/token_helpers.rb +92 -0
- data/lib/sportdb/parser/version.rb +2 -2
- data/lib/sportdb/parser.rb +24 -2
- metadata +18 -18
- data/config/rounds_de.txt +0 -125
- data/config/rounds_en.txt +0 -29
- data/config/rounds_es.txt +0 -26
- data/config/rounds_misc.txt +0 -25
- data/config/rounds_pt.txt +0 -4
- data/config/zones_en.txt +0 -20
- data/lib/sportdb/parser/lang.rb +0 -298
- data/lib/sportdb/parser/token-minute.rb +0 -205
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
module SportDb
|
|
2
|
+
class Lexer
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
##
|
|
7
|
+
# keep 18h30 - why? why not?
|
|
8
|
+
# add support for 6:30pm 8:20am etc. - why? why not?
|
|
9
|
+
#
|
|
10
|
+
# check - only support h e.g. 18h30 or 18H30 too - why? why not?
|
|
11
|
+
# e.g. 18:30 (or 18h30)
|
|
12
|
+
# note - optional timezone possible e.g.
|
|
13
|
+
# 18:30 UTC+1 or 18:30 BST/UTC+1 or such!!!
|
|
14
|
+
# 18:30 UTC+01 or 18:30 BST/UTC+01
|
|
15
|
+
#
|
|
16
|
+
#
|
|
17
|
+
# note 18.30 no longer supported - MUST use 18:30 or 18h30 !!!
|
|
18
|
+
#
|
|
19
|
+
#
|
|
20
|
+
#
|
|
21
|
+
# note - local time is now (inline) part of time!!!
|
|
22
|
+
# and, thus, must always follow time
|
|
23
|
+
# e.g. 18:30 (19:30 BST)
|
|
24
|
+
#
|
|
25
|
+
## local time e.g. (19:30 UTC+1) or (19:30 BST/UTC+1)
|
|
26
|
+
## note - timezone is optional! e.g. (19:30) works too
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
TIME_RE = %r{
|
|
30
|
+
\b
|
|
31
|
+
(?<time>
|
|
32
|
+
(?<hour>\d{1,2})
|
|
33
|
+
[:h]
|
|
34
|
+
(?<minute>\d{2})
|
|
35
|
+
|
|
36
|
+
#### optional (inline) timezone
|
|
37
|
+
## note - non-utc timezone MUST be hard-coded (added) here!!!
|
|
38
|
+
## avoids eating-up team names (separated by one space)
|
|
39
|
+
## e.g. 18:30 MEX v MEX
|
|
40
|
+
(?:
|
|
41
|
+
[ ] ## require space - why? why not
|
|
42
|
+
(?<timezone>
|
|
43
|
+
(?:
|
|
44
|
+
## GMT - Greenwich Mean Time
|
|
45
|
+
## BST - British Summer Time
|
|
46
|
+
## CES?T - Central European (Summer) Time
|
|
47
|
+
## EES?T - Eastern European (Summer) Time
|
|
48
|
+
##
|
|
49
|
+
(?: GMT|BST|CES?T|EES?T)
|
|
50
|
+
(?: /
|
|
51
|
+
UTC (?: [+-]\d{1,4} | ±0)
|
|
52
|
+
)?
|
|
53
|
+
)
|
|
54
|
+
|
|
|
55
|
+
(?:
|
|
56
|
+
UTC (?: [+-]\d{1,4} | ±0)
|
|
57
|
+
)
|
|
58
|
+
)
|
|
59
|
+
)?
|
|
60
|
+
)
|
|
61
|
+
\b
|
|
62
|
+
|
|
63
|
+
####
|
|
64
|
+
### note - local time is now INLINE and MUST follow time
|
|
65
|
+
(?:
|
|
66
|
+
[ ]+ ## todo/check - make space optional - why? why not?
|
|
67
|
+
\(
|
|
68
|
+
(?<time_local>
|
|
69
|
+
(?<local_hour>\d{1,2})
|
|
70
|
+
[:h] ### todo/fix - MUST match style in time above!!!
|
|
71
|
+
(?<local_minute>\d{2})
|
|
72
|
+
|
|
73
|
+
####
|
|
74
|
+
## optional "local" timezone name eg. BRT or CEST etc.
|
|
75
|
+
(?:
|
|
76
|
+
[ ] ## require space - why? why not
|
|
77
|
+
(?<local_timezone>
|
|
78
|
+
(?: [A-Z]{3,4}
|
|
79
|
+
(?: /
|
|
80
|
+
UTC (?: [+-]\d{1,4} | ±0)
|
|
81
|
+
)?
|
|
82
|
+
)
|
|
83
|
+
|
|
|
84
|
+
(?: ## e.g. 0 or 00 or 0000
|
|
85
|
+
UTC (?: [+-]\d{1,4} | ±0)
|
|
86
|
+
)
|
|
87
|
+
)
|
|
88
|
+
)? # note - make timezone optional!!!
|
|
89
|
+
)
|
|
90
|
+
\)
|
|
91
|
+
)?
|
|
92
|
+
}ix
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
##
# Build the time data (hash) from a TIME_RE match.
#
#  m - match data (or any object answering [] with symbol keys) with
#        :time, :hour, :minute and (optional) :timezone
#      plus (optional, if a local time in parentheses follows)
#        :time_local, :local_hour, :local_minute, :local_timezone
#
#  returns a hash e.g.
#     { time:       { h: 18, m: 30, timezone: 'UTC+1' },
#       time_local: { h: 19, m: 30, timezone: 'BST' } }
#   note - :timezone keys are only present if captured
#          :time_local is only present if captured
#
#  raises ArgumentError if the hour is not in 0..23 or
#          the minute is not in 0..59 (for time or local time)
def self._build_time( m )
  ## note - hours and minutes get stored as (integer) numbers
  ##           e.g. 18h30 or 18:30  =>  h: 18, m: 30
  data = { time: {} }

  hour   = m[:hour].to_i(10)    ## allow 08/07/etc.
  minute = m[:minute].to_i(10)

  ## check if 24:00 possible? or only 0:00 (23:59)
  unless (0..23).cover?( hour ) && (0..59).cover?( minute )
    raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
  end

  data[:time][:h] = hour
  data[:time][:m] = minute
  data[:time][:timezone] = m[:timezone]   if m[:timezone]


  ## check if local time present e.g.
  ##   18:30 (19:30)
  ##   18:30 (19:30 BST)  etc.
  if m[:time_local]
    data[:time_local] = {}

    local_hour   = m[:local_hour].to_i(10)    ## allow 08/07/etc.
    local_minute = m[:local_minute].to_i(10)

    ## fix - range check MUST use the local hour/minute
    ##         (and NOT the already checked hour/minute from above)
    unless (0..23).cover?( local_hour ) && (0..59).cover?( local_minute )
      raise ArgumentError, "parse error - local time >#{m[:time_local]}< out-of-range"
    end

    data[:time_local][:h] = local_hour
    data[:time_local][:m] = local_minute
    data[:time_local][:timezone] = m[:local_timezone]   if m[:local_timezone]
  end

  data
end
|
|
138
|
+
## instance-level convenience - forwards to the class-level _build_time
def _build_time( m )
  self.class._build_time( m )
end
|
|
139
|
+
|
|
140
|
+
end # class Lexer
|
|
141
|
+
end # module SportDb
|
data/lib/sportdb/parser/token.rb
CHANGED
|
@@ -4,63 +4,12 @@ module SportDb
|
|
|
4
4
|
class Lexer
|
|
5
5
|
|
|
6
6
|
|
|
7
|
-
##
|
|
8
|
-
# keep 18h30 - why? why not?
|
|
9
|
-
# add support for 6:30pm 8:20am etc. - why? why not?
|
|
10
|
-
#
|
|
11
|
-
# check - only support h e.g. 18h30 or 18H30 too - why? why not?
|
|
12
|
-
# e.g. 18.30 (or 18:30 or 18h30)
|
|
13
|
-
TIME_RE = %r{
|
|
14
|
-
(?<time> \b
|
|
15
|
-
(?: (?<hour>\d{1,2})
|
|
16
|
-
(?: :|\.|h )
|
|
17
|
-
(?<minute>\d{2}))
|
|
18
|
-
\b
|
|
19
|
-
)
|
|
20
|
-
}ix
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
## add wday / stand-alone week day - as separate regex or
|
|
25
|
-
## use TEXT with is_wday? check or such with
|
|
26
|
-
## requirement of beginning of line (anchored to line) only??
|
|
27
|
-
## - why? why not?
|
|
28
|
-
|
|
29
|
-
WDAY_RE = %r{
|
|
30
|
-
(?<wday>
|
|
31
|
-
\b # note - alternation (|) is lowest precedence (such
|
|
32
|
-
# parathenes required around \b()\b !!!
|
|
33
|
-
## note - NOT case sensitive!!!
|
|
34
|
-
(?<day_name>
|
|
35
|
-
(?-i:
|
|
36
|
-
Mon|Mo|
|
|
37
|
-
Tue|Tu|
|
|
38
|
-
Wed|We|
|
|
39
|
-
Thu|Th|
|
|
40
|
-
Fri|Fr|
|
|
41
|
-
Sat|Sa|
|
|
42
|
-
Sun|Su
|
|
43
|
-
))
|
|
44
|
-
(?=[ ]{2}) # positive lookahead for two space
|
|
45
|
-
## todo/check - must be followed by two spaces or space + [( etc.
|
|
46
|
-
## to allow words starting with weekday abbrevations - why? why not?
|
|
47
|
-
## check if any names (teams, rounds, etc) come up in practice
|
|
48
|
-
## or maybe remove three letter abbrevations Mon/Tue
|
|
49
|
-
## and keep only Mo/Tu/We etc. - why? why not?
|
|
50
|
-
)}x
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
7
|
|
|
55
8
|
BASICS_RE = %r{
|
|
56
|
-
## e.g. (51) or (1) etc. - limit digits of number???
|
|
57
|
-
## todo/fix - change num to ord (for ordinal number)!!!!!
|
|
58
|
-
(?<num> \( (?<value>\d+) \) )
|
|
59
|
-
|
|
|
60
9
|
(?<vs>
|
|
61
10
|
(?<=[ ]) # positive lookbehind for space
|
|
62
11
|
(?-i:
|
|
63
|
-
|
|
12
|
+
vs\.?|v|VS
|
|
64
13
|
) # note - only match case sensitive (downcased letters)!!!
|
|
65
14
|
# note - bigger match first e.g. vs than v etc.
|
|
66
15
|
(?=[ ]) # positive lookahead for space
|
|
@@ -69,87 +18,275 @@ BASICS_RE = %r{
|
|
|
69
18
|
(?<spaces> [ ]{2,}) |
|
|
70
19
|
(?<space> [ ])
|
|
71
20
|
|
|
|
72
|
-
(?<sym>
|
|
73
|
-
(?: ----|
|
|
74
|
-
---|
|
|
75
|
-
--
|
|
76
|
-
)
|
|
77
|
-
(?=[ ]) ## positive lookahead
|
|
78
|
-
)
|
|
79
|
-
|
|
|
80
|
-
(?<sym> [;,/@|\[\]-] )
|
|
21
|
+
(?<sym> [,;/@|()\[\]-] ) ### note: add parantheses too e.g () - why? why not?
|
|
81
22
|
}ix
|
|
82
23
|
|
|
83
24
|
|
|
84
25
|
|
|
26
|
+
|
|
27
|
+
###
|
|
28
|
+
## add att(endance) e.g. att: 18000
|
|
29
|
+
##
|
|
30
|
+
## A v B 2-1 att: 18000
|
|
31
|
+
|
|
32
|
+
ATTENDANCE_RE = %r{
|
|
33
|
+
(?<attendance>
|
|
34
|
+
\b
|
|
35
|
+
att: [ ]*
|
|
36
|
+
(?<value>
|
|
37
|
+
[1-9]
|
|
38
|
+
(?: _? \d+ )*
|
|
39
|
+
)
|
|
40
|
+
\b
|
|
41
|
+
)}ix
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
## "inline" match status e.g.
|
|
45
|
+
## Clapham Rovers w/o Hitchin
|
|
46
|
+
## Queen's Park bye
|
|
47
|
+
|
|
48
|
+
## add support for WO or W-0 too - why? why not?
|
|
49
|
+
INLINE_WO_RE = %r{
|
|
50
|
+
(?<inline_wo>
|
|
51
|
+
\b (?: w/o | W/O ) \b
|
|
52
|
+
)}x ## note - NOT case insensitive
|
|
53
|
+
|
|
54
|
+
INLINE_BYE_RE = %r{
|
|
55
|
+
(?<inline_bye>
|
|
56
|
+
\b (?: bye | BYE ) \b
|
|
57
|
+
)}x ## note - NOT case insensitive
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
###
|
|
61
|
+
# A n/p B (note - basically a inline short form of A v B [cancelled] )
|
|
62
|
+
# N/P
|
|
63
|
+
INLINE_NP_RE = %r{
|
|
64
|
+
(?<inline_np>
|
|
65
|
+
\b (?: n/p | N/P ) \b
|
|
66
|
+
)}x ## note - NOT case insensitive
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
###
|
|
70
|
+
# abd/abd. or aban/aban. [abandoned]
|
|
71
|
+
# ABD/ABAN
|
|
72
|
+
INLINE_ABD_RE = %r{
|
|
73
|
+
(?<inline_abd>
|
|
74
|
+
\b (?: abd\.? |
|
|
75
|
+
aban\.? |
|
|
76
|
+
ABD | ABAN
|
|
77
|
+
)
|
|
78
|
+
## POSITIVE lookahead - requires space
|
|
79
|
+
(?= [ ])
|
|
80
|
+
)}x ## note - NOT case insensitive
|
|
81
|
+
|
|
82
|
+
####
|
|
83
|
+
# susp/susp. [suspended]
|
|
84
|
+
# SUSP
|
|
85
|
+
INLINE_SUSP_RE = %r{
|
|
86
|
+
(?<inline_susp>
|
|
87
|
+
\b (?: susp\.? |
|
|
88
|
+
SUSP )
|
|
89
|
+
## POSITIVE lookahead - requires space
|
|
90
|
+
(?= [ ])
|
|
91
|
+
)}x ## note - NOT case insensitive
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
####
|
|
95
|
+
# ppd/ppd. or pst/pst. or pstp/pstp. or postp/postp. [postponed]
|
|
96
|
+
# PPD/PSTP/POSTP/P-P
|
|
97
|
+
# todo/check - add/allow p-p too - why? why not?
|
|
98
|
+
INLINE_PPD_RE = %r{
|
|
99
|
+
(?<inline_ppd>
|
|
100
|
+
\b (?: ppd\.? |
|
|
101
|
+
pst\.? |
|
|
102
|
+
po?stp\.? |
|
|
103
|
+
PPD | PST | PO?STP | P-P
|
|
104
|
+
)
|
|
105
|
+
## POSITIVE lookahead - requires space
|
|
106
|
+
(?= [ ])
|
|
107
|
+
)}x ## note - NOT case insensitive
|
|
108
|
+
|
|
109
|
+
####
|
|
110
|
+
# void via x-x X-X
|
|
111
|
+
# todo/check - only allow X-X - why? why not?
|
|
112
|
+
INLINE_VOID_RE = %r{
|
|
113
|
+
(?<inline_void>
|
|
114
|
+
\b (?: x-x |
|
|
115
|
+
X-X
|
|
116
|
+
)
|
|
117
|
+
## POSITIVE lookahead - requires space
|
|
118
|
+
(?= [ ])
|
|
119
|
+
)}x ## note - NOT case insensitive
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
####
|
|
123
|
+
# awd/awd. [awarded]
|
|
124
|
+
# AWD
|
|
125
|
+
# note - recommendation is to always include score
|
|
126
|
+
# thus, use/prefer SCORE_AWD e.g. 0-3 awd
|
|
127
|
+
INLINE_AWD_RE = %r{
|
|
128
|
+
(?<inline_awd>
|
|
129
|
+
\b (?: awd\.? | AWD )
|
|
130
|
+
## POSITIVE lookahead - requires space
|
|
131
|
+
(?= [ ])
|
|
132
|
+
)}x ## note - NOT case insensitive
|
|
133
|
+
|
|
134
|
+
###
|
|
135
|
+
# canc/canc. [cancelled]
|
|
136
|
+
# CANC
|
|
137
|
+
INLINE_CANC_RE = %r{
|
|
138
|
+
(?<inline_canc>
|
|
139
|
+
\b (?: canc\.? | CANC )
|
|
140
|
+
## POSITIVE lookahead - requires space
|
|
141
|
+
(?= [ ])
|
|
142
|
+
)}x ## note - NOT case insensitive
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
###
|
|
146
|
+
## home/away/neutral - (h), (a), (n)
|
|
147
|
+
## add support for h/a/n
|
|
148
|
+
## with (?-i \b [han] \b) lower-case and \b boundary - why? why not?
|
|
149
|
+
|
|
150
|
+
TEAM_HOME_RE = %r{ (?<team_home> \(h\) )}xi
|
|
151
|
+
TEAM_AWAY_RE = %r{ (?<team_away> \(a\) )}xi
|
|
152
|
+
TEAM_NEUTRAL_RE = %r{ (?<team_neutral> \(n\) )}xi
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
## "top-level" regex used for:
|
|
157
|
+
## - date_header
|
|
158
|
+
## - match_header & match_line_more
|
|
159
|
+
## - match_line
|
|
160
|
+
|
|
161
|
+
|
|
85
162
|
RE = Regexp.union(
|
|
86
|
-
STATUS_RE,
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
163
|
+
STATUS_RE, ## match status e.g. [cancelled], etc.
|
|
164
|
+
|
|
165
|
+
INLINE_WO_RE, ## (inline) match status - w/o (walkout)
|
|
166
|
+
INLINE_NP_RE, ## (inline) match status - n/p (not played)
|
|
167
|
+
INLINE_BYE_RE, ## (inline) match status - bye (advance to next round)
|
|
168
|
+
INLINE_ABD_RE, ## (inline) match status - abd/abd. (abandoned)
|
|
169
|
+
INLINE_SUSP_RE, ## (inline) match status - susp/susp. (suspended)
|
|
170
|
+
INLINE_PPD_RE, ## (inline) match status - ppd/ppd. or pstp/pstp. or postp/postp. or p-p (postponed)
|
|
171
|
+
INLINE_VOID_RE, ## (inline) match status - x-x (voided)
|
|
172
|
+
INLINE_AWD_RE, ## (inline) match status - awd/awd. (awarded)
|
|
173
|
+
INLINE_CANC_RE, ## (inline) match status - canc/canc. (cancelled/canceled)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
TEAM_HOME_RE, ## (H)
|
|
177
|
+
TEAM_AWAY_RE, ## (A)
|
|
178
|
+
TEAM_NEUTRAL_RE, ## (N)
|
|
179
|
+
|
|
180
|
+
NOTE_RE, ### fix - change to INLINE_NOTE !!!
|
|
181
|
+
DATE_LEGS_RE, # note - must go before date!!!
|
|
90
182
|
DATE_RE, ## note - date must go before time (e.g. 12.12. vs 12.12)
|
|
91
183
|
TIME_RE,
|
|
92
|
-
|
|
93
|
-
|
|
184
|
+
ATTENDANCE_RE, # note - allow att: for now inline in matches too - why? why not?
|
|
185
|
+
SCORE_LEGS_RE,
|
|
186
|
+
SCORE_FULL_RE,
|
|
187
|
+
SCORE_FULLER_RE,
|
|
188
|
+
SCORE_FULLER_MORE_RE,
|
|
189
|
+
SCORE_AWD_RE, # (inline) score awarded e.g. 3-0 awd or 0-1 awd. etc.
|
|
190
|
+
SCORE_ABD_RE, # (inline) score abandoned e.g. 2-1 abd.
|
|
191
|
+
SCORE_RE, ## note basic score e.g. 1-1 must go after SCORE_FULL_RE!!!
|
|
192
|
+
|
|
193
|
+
## note - add "experimental" "split" scores for now
|
|
194
|
+
SCORE_TEAM_RE, ## e.g. (2) 1 for "split" scores
|
|
195
|
+
SCORE_TEAM_PEN_RE, ## e.g. 1 (2)
|
|
196
|
+
|
|
94
197
|
BASICS_RE,
|
|
95
|
-
WDAY_RE, # allow standalone weekday name (e.g. Mo/Tu/etc.) - why? why not?
|
|
96
|
-
# note - wday MUST be after text e.g. Sun Ke 68' is Sun Ke (NOT Sun) etc.
|
|
97
198
|
TEXT_RE,
|
|
199
|
+
## note - score_team_num (e.g. 0 or 10 etc.)
|
|
200
|
+
## MUST BE after TEXT
|
|
201
|
+
## only match if nothing else matches (except ANY)
|
|
202
|
+
SCORE_TEAM_NUM_RE, ## e.g. 0 or 1 or 9 or 11 etc. (<100)
|
|
98
203
|
ANY_RE,
|
|
99
204
|
)
|
|
100
205
|
|
|
101
206
|
|
|
102
207
|
|
|
103
|
-
######################################################
|
|
104
|
-
## goal mode (switched to by PLAYER_WITH_MINUTE_RE)
|
|
105
208
|
|
|
106
|
-
GOAL_BASICS_RE = %r{
|
|
107
|
-
(?<spaces> [ ]{2,}) |
|
|
108
|
-
(?<space> [ ])
|
|
109
|
-
|
|
|
110
|
-
(?<sym>
|
|
111
|
-
[;,\[\]] ## add (-) dash too - why? why not?
|
|
112
|
-
)
|
|
113
|
-
}ix
|
|
114
209
|
|
|
115
210
|
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
211
|
+
## ord (for ordinal number)
|
|
212
|
+
## e.g. (51) or (1) etc. - limit digits of number - why? why not???
|
|
213
|
+
|
|
214
|
+
START_WITH_ORD = %r{
|
|
215
|
+
\A
|
|
216
|
+
[ ]* ## ignore leading spaces (if any)
|
|
217
|
+
(?<ord>
|
|
218
|
+
\(
|
|
219
|
+
(?<value>\d+)
|
|
220
|
+
\)
|
|
221
|
+
)}ix
|
|
124
222
|
|
|
125
|
-
## note - leave out n/a minute in goals - make minutes optional!!!
|
|
126
|
-
PROP_GOAL_RE = Regexp.union(
|
|
127
|
-
GOAL_BASICS_RE,
|
|
128
|
-
MINUTE_RE,
|
|
129
|
-
## MINUTE_NA_RE, ## note - add/allow not/available (n/a,na) minutes hack for now
|
|
130
|
-
GOAL_OG_RE, GOAL_PEN_RE,
|
|
131
|
-
SCORE_RE,
|
|
132
|
-
PROP_NAME_RE, ## note - (re)use prop name for now for (player) name
|
|
133
|
-
)
|
|
134
223
|
|
|
224
|
+
###
|
|
225
|
+
## e.g. 1930, 1986, 2002, 2010, 2022, 2026
|
|
226
|
+
## note - only YYYY
|
|
227
|
+
## note - look out for clubs like 1860 München (de) !!!
|
|
228
|
+
## 1899 Hoffenheim (de)
|
|
229
|
+
## 1896 Löwenherz (ch - a.k.a. FC Winterthur ??)
|
|
230
|
+
## any others starting with YYYY ?!
|
|
231
|
+
## note - YEAR requires TWO (trailing) spaces !!!!! e.g.
|
|
232
|
+
## 1930 Uruguay 4-2 Argentina
|
|
233
|
+
## 1934 Italy 2-1 Czechoslovakia (AET)
|
|
234
|
+
## 2022 Argentina 3-3 France (AET, 4-2 pen)
|
|
235
|
+
##
|
|
236
|
+
## do NOT match (iso date!!) - 2020-11-12
|
|
237
|
+
## 2020/11/12
|
|
238
|
+
## 2020.11.12 etc.
|
|
135
239
|
|
|
136
|
-
|
|
137
|
-
|
|
240
|
+
START_WITH_YEAR = %r{
|
|
241
|
+
\A
|
|
242
|
+
[ ]* ## ignore leading spaces (if any)
|
|
243
|
+
(?<year>
|
|
244
|
+
\d{4}
|
|
245
|
+
)
|
|
246
|
+
## positive lookahead
|
|
247
|
+
(?= [ ]{2} | ## min. TWO spaces or
|
|
248
|
+
[ ]@ | ## space with geo marker or
|
|
249
|
+
[ ]* \z ## year (date) header (end-of-line/string)
|
|
250
|
+
)
|
|
251
|
+
}x
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
###
|
|
256
|
+
## check for headings
|
|
257
|
+
## e.g. = heading 1
|
|
258
|
+
## == heading 2 etc.
|
|
259
|
+
## =Eurochampionship=
|
|
260
|
+
## note - no spaces required (same as in wikipedia!!)
|
|
261
|
+
## same as in wikipedia support six (6) levels
|
|
262
|
+
##
|
|
138
263
|
## note - use \A (instead of ^) - \A strictly matches the start of the string.
|
|
139
|
-
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
HEADING_RE = %r{ \A
|
|
140
267
|
[ ]* ## ignore leading spaces (if any)
|
|
141
|
-
(
|
|
142
|
-
[ ]
|
|
143
|
-
(?<
|
|
268
|
+
(?<heading_marker> ={1,6} )
|
|
269
|
+
[ ]*
|
|
270
|
+
(?<heading>
|
|
144
271
|
## must start with letter - why? why not?
|
|
145
272
|
### 1st round
|
|
146
273
|
## allow numbers e.g. Group A - 1
|
|
147
|
-
|
|
274
|
+
[^=]+? ## use non-greedy
|
|
148
275
|
)
|
|
149
|
-
[ ]* ## ignore trailing spaces (if any)
|
|
150
|
-
|
|
276
|
+
[ ]* ## ignore trailing spaces (if any)
|
|
277
|
+
(?: =* ) ## allow any trailing heading markers
|
|
278
|
+
[ ]* ## ignore trailing spaces (if any)
|
|
279
|
+
\z
|
|
151
280
|
}ix
|
|
152
281
|
|
|
153
282
|
|
|
283
|
+
HRULER_RE = %r{
|
|
284
|
+
\A
|
|
285
|
+
[ ]* ## ignore leading spaces (if any)
|
|
286
|
+
-{3,} ## must be at least three dashes!!!
|
|
287
|
+
[ ]* ## ignore trailing spaces (if any)
|
|
288
|
+
\z
|
|
289
|
+
}ix
|
|
290
|
+
|
|
154
291
|
end # class Lexer
|
|
155
292
|
end # module SportDb
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
module SportDb
|
|
2
|
+
class Lexer
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
=begin
|
|
6
|
+
def self._mk_is( re )
|
|
7
|
+
## add \A ... \z to regex
|
|
8
|
+
## for strict matching of beginning and end of string
|
|
9
|
+
## regex note - \z will NOT allow trailing newline(s)!!!!
|
|
10
|
+
## note - must double escape \\A,\\z in quoted string!!
|
|
11
|
+
Regexp.new( %Q< \\A
|
|
12
|
+
(?:#{re.source})
|
|
13
|
+
\\z
|
|
14
|
+
>, re.options )
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
IS_TEAM_RE = _mk_is( TEXT_RE ) ## todo/fix - rename TEXT_RE to TEAM_RE!!!
|
|
19
|
+
IS_DATE_RE = _mk_is( DATE_IIII_RE ) ## DATE_RE )
|
|
20
|
+
=end
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
##
# Match a team (text) using TEXT_RE.
#
#  str - string to check (leading/trailing spaces get stripped)
#
#  returns the match data if TEXT_RE matches the complete
#    (stripped) string; otherwise nil
def self._parse_team( str )
  match = TEXT_RE.match( str.strip )

  ## no match at all
  return nil if match.nil?

  ## match BUT not anchored to start and end-of-string
  ##   todo/check - report an error somehow?
  covers_all = match.pre_match.empty? && match.post_match.empty?
  return nil unless covers_all

  match
end
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
##
# Parse a date using DATE_RE.
#
#  str - string to check (leading/trailing spaces get stripped)
#
#  returns a hash with the captured components (only if present)
#      :y    - year                 (number)
#      :yy   - two-digit year       (number)
#      :m    - month                (number or looked up via MONTH_MAP)
#      :d    - day                  (number)
#      :wday - week day             (looked up via DAY_MAP)
#    or nil if DATE_RE does not match the complete (stripped) string
def self._parse_date( str )
  match = DATE_RE.match( str.strip )

  ## note - wrapping the regex union with \A \z is NOT working (check later);
  ##          thus, check by hand that nothing is left before/after the match
  return nil if match.nil?
  return nil unless match.pre_match == '' && match.post_match == ''

  parts = {}

  ## numeric components - year / two-digit year (e.g. 25 or 78) / month
  { y: :year, yy: :yy, m: :month }.each do |key, group|
    value = match[group]
    parts[key] = value.to_i(10)   if value
  end

  ## month name - allow any/upcase JULY/JUL etc. thus ALWAYS downcase for lookup
  month_name = match[:month_name]
  parts[:m] = MONTH_MAP[ month_name.downcase ]   if month_name

  day = match[:day]
  parts[:d] = day.to_i(10)   if day

  day_name = match[:day_name]
  parts[:wday] = DAY_MAP[ day_name.downcase ]   if day_name

  parts
end
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
##
# Parse a (full) score using SCORE_FULL_RE.
#
#  str - string to check (leading/trailing spaces get stripped);
#          nil is passed through and results in nil
#
#  returns a hash with the captured score pairs (only if both values present)
#      :p  - from captures p1/p2
#      :et - from captures et1/et2
#      :ft - from captures ft1/ft2
#      :ht - from captures ht1/ht2
#    or nil if SCORE_FULL_RE does not match the complete (stripped) string
def self._parse_score_full( str )
  ## note - strip - leading/trailing spaces
  ##   fix - strip was documented but missing
  ##           (now same as _parse_team / _parse_date)
  ##   note - keep nil in / nil out (Regexp#match returns nil for nil)
  return nil if str.nil?

  m = SCORE_FULL_RE.match( str.strip )

  ## no match - return nil
  return nil if m.nil?

  ## note - match BUT not anchored to start and end-of-string!!!
  ##          report, error somehow??
  return nil unless m.pre_match == '' && m.post_match == ''

  score = {}
  ## note - agg(regate) not supported (for now)
  %i[p et ft ht].each do |key|
    first  = m[:"#{key}1"]
    second = m[:"#{key}2"]
    score[key] = [first.to_i, second.to_i]   if first && second
  end
  score
end
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
end # class Lexer
|
|
92
|
+
end # module SportDb
|
data/lib/sportdb/parser.rb
CHANGED
|
@@ -15,17 +15,39 @@ require 'cocos'
|
|
|
15
15
|
|
|
16
16
|
require_relative 'parser/version'
|
|
17
17
|
|
|
18
|
-
|
|
18
|
+
##
|
|
19
|
+
## generic helper
|
|
20
|
+
require_relative 'parser/blocktxt'
|
|
21
|
+
|
|
22
|
+
## core machinery
|
|
19
23
|
|
|
20
24
|
require_relative 'parser/token-score'
|
|
25
|
+
require_relative 'parser/token-score_fuller'
|
|
26
|
+
require_relative 'parser/token-score_legs'
|
|
27
|
+
require_relative 'parser/token-time'
|
|
21
28
|
require_relative 'parser/token-date'
|
|
29
|
+
require_relative 'parser/token-date_duration'
|
|
22
30
|
require_relative 'parser/token-text'
|
|
31
|
+
require_relative 'parser/token-prop_name' ## a.k.a token-text_ii
|
|
23
32
|
require_relative 'parser/token-status'
|
|
24
|
-
require_relative 'parser/token-
|
|
33
|
+
require_relative 'parser/token-note'
|
|
34
|
+
require_relative 'parser/token-goals'
|
|
25
35
|
require_relative 'parser/token-prop' ## team prop(erty) mode (note - must be before token)
|
|
26
36
|
require_relative 'parser/token-geo'
|
|
37
|
+
require_relative 'parser/token-group'
|
|
38
|
+
require_relative 'parser/token-round'
|
|
39
|
+
require_relative 'parser/token-table'
|
|
27
40
|
require_relative 'parser/token'
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
### add token ("private") parse helpers e.g. _parse_team() etc.
|
|
44
|
+
require_relative 'parser/token_helpers'
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
require_relative 'parser/lexer_buffer' ## incl. Tokens (aka TokenBuffer)
|
|
28
48
|
require_relative 'parser/lexer'
|
|
49
|
+
require_relative 'parser/lexer_tty' ## teletype (tty) mode
|
|
50
|
+
|
|
29
51
|
|
|
30
52
|
require_relative 'parser/parser' ## auto-generated by racc (from parser.y)
|
|
31
53
|
require_relative 'parser/racc_parser'
|