sportdb-parser 0.6.20 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Manifest.txt +14 -8
- data/Rakefile +1 -1
- data/lib/sportdb/parser/blocktxt.rb +99 -0
- data/lib/sportdb/parser/lexer.rb +958 -395
- data/lib/sportdb/parser/lexer_buffer.rb +97 -0
- data/lib/sportdb/parser/lexer_tty.rb +111 -0
- data/lib/sportdb/parser/parser.rb +1768 -855
- data/lib/sportdb/parser/racc_parser.rb +1 -1
- data/lib/sportdb/parser/racc_tree.rb +327 -41
- data/lib/sportdb/parser/token-date.rb +160 -178
- data/lib/sportdb/parser/token-date_duration.rb +190 -0
- data/lib/sportdb/parser/token-geo.rb +59 -59
- data/lib/sportdb/parser/token-goals.rb +460 -0
- data/lib/sportdb/parser/token-group.rb +43 -0
- data/lib/sportdb/parser/token-note.rb +40 -0
- data/lib/sportdb/parser/token-prop.rb +70 -54
- data/lib/sportdb/parser/token-prop_name.rb +74 -0
- data/lib/sportdb/parser/token-round.rb +102 -0
- data/lib/sportdb/parser/token-score.rb +323 -47
- data/lib/sportdb/parser/token-score_fuller.rb +435 -0
- data/lib/sportdb/parser/token-score_legs.rb +59 -0
- data/lib/sportdb/parser/token-status.rb +157 -160
- data/lib/sportdb/parser/token-table.rb +149 -0
- data/lib/sportdb/parser/token-text.rb +72 -23
- data/lib/sportdb/parser/token-time.rb +141 -0
- data/lib/sportdb/parser/token.rb +242 -105
- data/lib/sportdb/parser/token_helpers.rb +92 -0
- data/lib/sportdb/parser/version.rb +2 -2
- data/lib/sportdb/parser.rb +24 -2
- metadata +18 -18
- data/config/rounds_de.txt +0 -125
- data/config/rounds_en.txt +0 -29
- data/config/rounds_es.txt +0 -26
- data/config/rounds_misc.txt +0 -25
- data/config/rounds_pt.txt +0 -4
- data/config/zones_en.txt +0 -20
- data/lib/sportdb/parser/lang.rb +0 -298
- data/lib/sportdb/parser/token-minute.rb +0 -205
|
@@ -7,191 +7,188 @@ class Lexer
|
|
|
7
7
|
## add more variants - why? why not?
|
|
8
8
|
|
|
9
9
|
|
|
10
|
+
POSTPONED = %Q{ (?<postponed> postponed | pst\\.? | po?stp\\.? | ppd\\.? ) }
|
|
11
|
+
CANCELED = %Q{ (?<canceled> cancell?ed | canc\\.? ) } ## add can/can. - why? why not?
|
|
12
|
+
WALKOVER = %Q{ (?<walkover> walkover | w/o | wo ) } ## add o/w too - why? why not?
|
|
13
|
+
AWARDED = %Q{ (?<awarded> awarded | awd\\.? ) }
|
|
14
|
+
SUSPENDED = %Q{ (?<suspended> suspended | susp\\.? ) }
|
|
15
|
+
ABANDONED = %Q{ (?<abandoned> abandoned | aban\\.? | abd\\.? ) }
|
|
16
|
+
ANNULLED = %Q{ (?<annulled> annulled ) }
|
|
17
|
+
VOIDED = %Q{ (?<voided> voided | void ) } ### note - alternative (name) to annulled
|
|
18
|
+
|
|
19
|
+
REPLAY = %Q{ (?<replay> replay | repl\\.? ) }
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
##
|
|
23
|
+
## note - status_note incl. complete text incl. <status> (not normalized)
|
|
24
|
+
## <status> gets normalized e.g. ppd => postponed etc.
|
|
25
|
+
|
|
10
26
|
STATUS_RE = %r{
|
|
11
27
|
\[
|
|
12
28
|
(?:
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
29
|
+
#############################################
|
|
30
|
+
### opt 1 - allow long forms with note/comment for some stati
|
|
31
|
+
## e.g. [postponed due to tropical storm "Hanna"]
|
|
32
|
+
## [suspended at 84' by storm; result stood]
|
|
33
|
+
#########################
|
|
34
|
+
(?: (?<status_note>
|
|
35
|
+
(?<status>
|
|
36
|
+
####################
|
|
37
|
+
## pre-match (not played)
|
|
38
|
+
#{POSTPONED}
|
|
39
|
+
|
|
|
40
|
+
#{CANCELED}
|
|
41
|
+
|
|
|
42
|
+
#{WALKOVER}
|
|
43
|
+
|
|
|
44
|
+
######################
|
|
45
|
+
## pre/post match
|
|
46
|
+
#{AWARDED}
|
|
18
47
|
|
|
|
19
|
-
|
|
48
|
+
########################
|
|
49
|
+
## post match - (partially) played
|
|
50
|
+
#{SUSPENDED}
|
|
51
|
+
|
|
|
52
|
+
#{ABANDONED}
|
|
20
53
|
|
|
|
21
|
-
|
|
22
|
-
## e.g. [abandoned at 1-1 in 65' due to cardiac arrest Luton player Tom Lockyer]
|
|
23
|
-
## [abandoned at 0-0 in 6' due to waterlogged pitch]
|
|
24
|
-
## [abandoned at 5-0 in 80' due to attack on assistant referee by Cerro; result stood]
|
|
25
|
-
## [abandoned at 1-0 in 31']
|
|
26
|
-
## [abandoned at 0-1' in 85 due to crowd trouble]
|
|
54
|
+
#{ANNULLED}
|
|
27
55
|
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
suspended
|
|
35
|
-
## e.g. [suspended at 0-0 in 12' due to storm]
|
|
36
|
-
## [suspended at 84' by storm; result stood]
|
|
37
|
-
|
|
|
38
|
-
verified
|
|
39
|
-
## e.g. [verified 2:0 wo.]
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
) [ ;,]* (?<status_note> [^\]]+ )
|
|
43
|
-
[ ]*
|
|
56
|
+
#{VOIDED} ### note - alternative to annulled
|
|
57
|
+
) ## end-of-<status>
|
|
58
|
+
[ :;,-]+ ## leading spaces (or separators)
|
|
59
|
+
[^\]]+? ## note - add non-greedy match
|
|
60
|
+
) ## end-of-<status-note>
|
|
61
|
+
[ ]* ## eat-up optional trailing spaces
|
|
44
62
|
)
|
|
45
|
-
|
|
|
46
|
-
|
|
47
|
-
|
|
63
|
+
|
|
|
64
|
+
########################################
|
|
65
|
+
## opt 2 - short form only (no note/comments) e.g. [postponed], [Canceled], etc.
|
|
66
|
+
####################################
|
|
48
67
|
(?<status>
|
|
49
|
-
|
|
68
|
+
####################
|
|
69
|
+
## pre-match (not played)
|
|
70
|
+
#{POSTPONED}
|
|
50
71
|
|
|
|
51
|
-
|
|
72
|
+
#{CANCELED}
|
|
52
73
|
|
|
|
53
|
-
|
|
74
|
+
#{WALKOVER}
|
|
54
75
|
|
|
|
55
|
-
|
|
76
|
+
######################
|
|
77
|
+
## pre/post match
|
|
78
|
+
#{AWARDED}
|
|
56
79
|
|
|
|
57
|
-
|
|
80
|
+
########################
|
|
81
|
+
## post match - (partially) played
|
|
82
|
+
#{SUSPENDED}
|
|
58
83
|
|
|
|
59
|
-
|
|
84
|
+
#{ABANDONED}
|
|
60
85
|
|
|
|
61
|
-
|
|
86
|
+
#{ANNULLED}
|
|
62
87
|
|
|
|
63
|
-
|
|
64
|
-
### move to note(s) - do NOT interpret as status - why? why not?
|
|
88
|
+
#{VOIDED} ### note - alternative to annulled
|
|
65
89
|
|
|
|
66
|
-
|
|
67
|
-
|
|
90
|
+
#{REPLAY} ### todo/fix - keep replay - why? why not?
|
|
91
|
+
### prefer replay in round e.g.
|
|
92
|
+
## ▪ Round 17, Replay
|
|
93
|
+
## ▪ Semi-finals, Replays
|
|
68
94
|
)
|
|
69
95
|
)
|
|
70
96
|
\]
|
|
71
97
|
}ix
|
|
72
98
|
|
|
73
99
|
|
|
100
|
+
def self._build_status( m )
|
|
101
|
+
status = {}
|
|
102
|
+
## note - norm status text - why? why not?
|
|
103
|
+
status[:status] = if m[:postponed] then 'postponed'
|
|
104
|
+
elsif m[:canceled] then 'canceled'
|
|
105
|
+
elsif m[:walkover] then 'walkover'
|
|
106
|
+
elsif m[:awarded] then 'awarded'
|
|
107
|
+
elsif m[:suspended] then 'suspended'
|
|
108
|
+
elsif m[:abandoned] then 'abandoned'
|
|
109
|
+
elsif m[:annulled] ||
|
|
110
|
+
m[:voided] then 'annulled'
|
|
111
|
+
elsif m[:replay] then 'replay'
|
|
112
|
+
else ## fallback on "generic" status (shouldn't happen)
|
|
113
|
+
m[:status]
|
|
114
|
+
end
|
|
74
115
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
(?<note>
|
|
82
|
-
(?: ## starting with ___ PLUS requiring more text
|
|
83
|
-
(?:
|
|
84
|
-
nb:
|
|
85
|
-
## e.g. [NB: between top-8 of regular season]
|
|
86
|
-
# [NB: América, Morelia and Tigres qualified on better record regular season]
|
|
87
|
-
# [NB: Celaya qualified on away goals]
|
|
88
|
-
# [NB: Alebrijes qualified on away goal]
|
|
89
|
-
# [NB: Leones Negros qualified on away goals]
|
|
90
|
-
#
|
|
91
|
-
# todo/fix:
|
|
92
|
-
# add "top-level" NB: version
|
|
93
|
-
## with full (end-of) line note - why? why not?
|
|
94
|
-
|
|
|
95
|
-
rescheduled
|
|
96
|
-
## e.g. [rescheduled due to earthquake occurred in Mexico on September 19]
|
|
97
|
-
|
|
|
98
|
-
declared
|
|
99
|
-
## e.g. [declared void]
|
|
100
|
-
|
|
|
101
|
-
remaining
|
|
102
|
-
## e.g. [remaining 79']
|
|
103
|
-
## [remaining 84']
|
|
104
|
-
## [remaining 59']
|
|
105
|
-
## [remaining 5']
|
|
106
|
-
)
|
|
107
|
-
[ ]
|
|
108
|
-
[^\]]+? ## slurp all to next ] - (use non-greedy)
|
|
109
|
-
)
|
|
110
|
-
)
|
|
111
|
-
\]
|
|
112
|
-
}ix
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
SCORE_NOTE_RE = %r{
|
|
117
|
-
\[
|
|
118
|
-
(?<score_note>
|
|
119
|
-
(?: # plain aet e.g. [aet]
|
|
120
|
-
aet | a\.e\.t\. |
|
|
121
|
-
after [ ] extra [ -] time
|
|
122
|
-
)
|
|
123
|
-
|
|
|
124
|
-
(?: # plain penalties e.g. [3-2 pen]
|
|
125
|
-
\d{1,2}-\d{1,2}
|
|
126
|
-
[ ]* (?: p|pen )
|
|
127
|
-
)
|
|
128
|
-
|
|
|
129
|
-
(?: # plain aet with penalties e.g. [aet; 4-3 pen] or [aet, 4-3p]
|
|
130
|
-
aet [ ]* [,;]
|
|
131
|
-
[ ]*
|
|
132
|
-
\d{1,2}-\d{1,2}
|
|
133
|
-
[ ]* (?: p|pen )
|
|
134
|
-
)
|
|
135
|
-
|
|
|
136
|
-
(?:
|
|
137
|
-
## e.g. Spain wins on penalties
|
|
138
|
-
## 1860 München wins on penalties etc.
|
|
139
|
-
## must start with digit 1-9 or letter
|
|
140
|
-
## todo - add more special chars - why? why not?
|
|
141
|
-
##
|
|
142
|
-
(?:
|
|
143
|
-
aet [ ]* ## allow space here - why? why not
|
|
144
|
-
[,;][ ]
|
|
145
|
-
)?
|
|
146
|
-
|
|
147
|
-
(?:
|
|
148
|
-
(?: # opt 1 - no team listed/named - requires score
|
|
149
|
-
(?: won|wins? ) [ ] ## note - allow won,win or wins
|
|
150
|
-
(?: ## score
|
|
151
|
-
\d{1,2}-\d{1,2}
|
|
152
|
-
[ ]
|
|
153
|
-
)
|
|
154
|
-
on [ ] (?: pens | penalties |
|
|
155
|
-
aggregate )
|
|
156
|
-
)
|
|
157
|
-
|
|
|
158
|
-
(?: # opt 2 - team required; score optional
|
|
159
|
-
(?: ## team required
|
|
160
|
-
[1-9\p{L}][0-9\p{L} .-]+?
|
|
161
|
-
[ ]
|
|
162
|
-
)
|
|
163
|
-
(?: won|wins? ) [ ] ## won/win/wins
|
|
164
|
-
(?: ## score optional
|
|
165
|
-
\d{1,2}-\d{1,2}
|
|
166
|
-
[ ]
|
|
167
|
-
)?
|
|
168
|
-
on [ ] (?: pens | penalties |
|
|
169
|
-
aggregate )
|
|
170
|
-
### [^\]]*? ## allow more? use non-greedy
|
|
171
|
-
)
|
|
172
|
-
))
|
|
173
|
-
|
|
|
174
|
-
(?: ## e.g. agg 3-2 etc.
|
|
175
|
-
agg [ ] \d{1,2}-\d{1,2}
|
|
176
|
-
)
|
|
177
|
-
|
|
|
178
|
-
(?: ## e.g. agg 4-4, Ajax win on away goals
|
|
179
|
-
(?: ## agg 4-4, optional for now - why? why not?
|
|
180
|
-
agg [ ] \d{1,2}-\d{1,2}
|
|
181
|
-
[ ]*[,;][ ]
|
|
182
|
-
)?
|
|
183
|
-
(?: ## team required
|
|
184
|
-
[1-9\p{L}][0-9\p{L} .-]+?
|
|
185
|
-
[ ]
|
|
186
|
-
)
|
|
187
|
-
(?: won|wins? ) [ ] # won/win/wins
|
|
188
|
-
on [ ] away [ ] goals
|
|
189
|
-
)
|
|
190
|
-
) # score_note ref
|
|
191
|
-
\]
|
|
192
|
-
}ix
|
|
116
|
+
## includes note? e.g. awarded; originally 2-0
|
|
117
|
+
status[:status_note] = m[:status_note] if m[:status_note]
|
|
118
|
+
|
|
119
|
+
status
|
|
120
|
+
end
|
|
121
|
+
def _build_status( m ) self.class._build_status( m ); end
|
|
193
122
|
|
|
194
123
|
|
|
195
124
|
end # class Lexer
|
|
196
125
|
end # module SportDb
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
__END__
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
####################
|
|
134
|
+
## pre-match (not played)
|
|
135
|
+
postponed|postp\.|ppd\.
|
|
136
|
+
## e.g. [postponed due to problems with the screen of the stadium]
|
|
137
|
+
## [postponed by storm]
|
|
138
|
+
## [postponed due to tropical storm "Hanna"]
|
|
139
|
+
## [postponed from Sep 10-12 due to death Queen Elizabeth II]
|
|
140
|
+
|
|
141
|
+
cancell?ed|canc\.
|
|
142
|
+
|
|
143
|
+
walkover|w/o|wo
|
|
144
|
+
## A victory awarded to one team because the opponent was unable
|
|
145
|
+
## or unwilling to compete (e.g., failing to show up or being disqualified).
|
|
146
|
+
## -or-
|
|
147
|
+
## A walkover or "win over" reveals when a team has won a game
|
|
148
|
+
## without it being played.
|
|
149
|
+
## -or-
|
|
150
|
+
## see <https://en.wikipedia.org/wiki/Walkover>
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
######################
|
|
155
|
+
## pre/post match
|
|
156
|
+
awarded|awd\.
|
|
157
|
+
|
|
158
|
+
## e.g. [awarded match to Leones Negros by undue alignment; original result 1-2]
|
|
159
|
+
## [awarded 3-0 to Cafetaleros by undue alignment; originally ended 2-0]
|
|
160
|
+
## [awarded 3-0; originally 0-2, América used ineligible player (Federico Viñas)]
|
|
161
|
+
|
|
162
|
+
## A result that is decided by a governing body
|
|
163
|
+
## (like FIFA or a domestic league) rather than by the play on the pitch.
|
|
164
|
+
## Usually follows a Forfeit or Walkover.
|
|
165
|
+
## If a team refuses to play, abandons a match, or fields an ineligible player,
|
|
166
|
+
## the opponent is typically awarded a 3-0 victory.
|
|
167
|
+
|
|
168
|
+
########################
|
|
169
|
+
## post match - (partially) played
|
|
170
|
+
suspended|susp\.
|
|
171
|
+
|
|
172
|
+
## e.g. [suspended at 0-0 in 12' due to storm]
|
|
173
|
+
## [suspended at 84' by storm; result stood]
|
|
174
|
+
|
|
175
|
+
## The match is temporarily halted but intended to be resumed or restarted later.
|
|
176
|
+
|
|
177
|
+
abandoned|aban\.|abd\.
|
|
178
|
+
|
|
179
|
+
## e.g. [abandoned at 1-1 in 65' due to cardiac arrest
|
|
180
|
+
## Luton player Tom Lockyer]
|
|
181
|
+
## [abandoned at 0-0 in 6' due to waterlogged pitch]
|
|
182
|
+
## [abandoned at 5-0 in 80' due to attack
|
|
183
|
+
## on assistant referee by Cerro; result stood]
|
|
184
|
+
## [abandoned at 1-0 in 31']
|
|
185
|
+
## [abandoned at 0-1 in 85' due to crowd trouble]
|
|
186
|
+
|
|
187
|
+
## The match started but was stopped by the referee before the final whistle
|
|
188
|
+
## (e.g., due to a waterlogged pitch or player injury) and did not resume
|
|
189
|
+
|
|
190
|
+
annulled OR voided|void
|
|
191
|
+
## The match result is struck from the record entirely,
|
|
192
|
+
## usually due to a team's withdrawal from the league or a severe rule violation.
|
|
193
|
+
|
|
197
194
|
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
module SportDb
|
|
2
|
+
class Lexer
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
###
|
|
6
|
+
## check for
|
|
7
|
+
## table (standing) lines
|
|
8
|
+
##
|
|
9
|
+
## e.g.
|
|
10
|
+
##
|
|
11
|
+
## Pld W D L GF-GA Pts | d d d d-d d
|
|
12
|
+
## Pld GF-GA Pts | d d-d d
|
|
13
|
+
## Pld Pts W D L GF-GA | d d d d d d-d
|
|
14
|
+
##
|
|
15
|
+
## Pld = matches played
|
|
16
|
+
## GF-GA = goals for, goals against
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
## Pld W D L GF-GA Pts | d d d d-d d
|
|
20
|
+
##
|
|
21
|
+
## 1.BRAZIL 3 2 1 0 7- 2 7
|
|
22
|
+
## 2.MEXICO 3 2 1 0 4- 1 7
|
|
23
|
+
## 3.Croatia 3 1 0 2 6- 6 3
|
|
24
|
+
## 4.Cameroon 3 0 0 3 1- 9 0
|
|
25
|
+
|
|
26
|
+
## add more headings?? e.g.
|
|
27
|
+
## Final Table:
|
|
28
|
+
##
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
TABLE_HEADING_I_RE = %r{
|
|
32
|
+
\A
|
|
33
|
+
[ ]* ## ignore leading spaces (if any)
|
|
34
|
+
(?<table_heading>
|
|
35
|
+
\b
|
|
36
|
+
P(?:ld)? [ ]+
|
|
37
|
+
W [ ]+
|
|
38
|
+
D [ ]+
|
|
39
|
+
L [ ]+
|
|
40
|
+
Gls [ ]+
|
|
41
|
+
Pts
|
|
42
|
+
\b
|
|
43
|
+
)
|
|
44
|
+
[ ]* ## ignore trailing spaces (if any)
|
|
45
|
+
\z
|
|
46
|
+
}xi
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
##
|
|
50
|
+
## "solid"-style
|
|
51
|
+
## -----------------------------------------------------
|
|
52
|
+
## "dashed"-style ??
|
|
53
|
+
## - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
TABLE_DIVIDER_RE = %r{
|
|
57
|
+
\A
|
|
58
|
+
[ ]* ## ignore leading spaces (if any)
|
|
59
|
+
(?<table_divider>
|
|
60
|
+
(?: --- ## note - require three dashes minimum (---)
|
|
61
|
+
[-]*
|
|
62
|
+
)
|
|
63
|
+
|
|
|
64
|
+
(?: - [ ]+ - [ ]+ - ## note - require three dashes minimum (- - -)
|
|
65
|
+
(?: [ ]+ -)* ## todo/check - restrict spaces to 2 or 3 or such - why? why not?
|
|
66
|
+
)
|
|
67
|
+
)
|
|
68
|
+
[ ]* ## ignore trailing spaces (if any)
|
|
69
|
+
\z
|
|
70
|
+
}xi
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
####
|
|
74
|
+
## 1.SOLOMON I. 1 1 0 0 3- 1 3
|
|
75
|
+
## 2.TAHITI 1 0 0 1 1- 3 0
|
|
76
|
+
## -.Cook Islands withdrew after first match (annulled) due to Covid-19 outbreak in squad
|
|
77
|
+
## -.Vanuatu withdrew before playing any matches due to Covid-19 outbreak in squad -->
|
|
78
|
+
##
|
|
79
|
+
## note - starting with -. is a table note!!!
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
TABLE_NOTE_RE = %r{
|
|
83
|
+
\A
|
|
84
|
+
[ ]* ## ignore leading spaces (if any)
|
|
85
|
+
-\.
|
|
86
|
+
[ ]*
|
|
87
|
+
(?<table_note>
|
|
88
|
+
.+? ## note - use non-greedy
|
|
89
|
+
)
|
|
90
|
+
[ ]* ## ignore trailing spaces (if any)
|
|
91
|
+
\z
|
|
92
|
+
}xi
|
|
93
|
+
|
|
94
|
+
TABLE_I_RE = %r{
|
|
95
|
+
(?<table>\b
|
|
96
|
+
\d{1,2} [ ]+ # Pld
|
|
97
|
+
\d{1,2} [ ]+ # W
|
|
98
|
+
\d{1,2} [ ]+ # D
|
|
99
|
+
\d{1,2} [ ]+ # L
|
|
100
|
+
(?: \d{1,3} - [ ]* \d{1,3} [ ]+ ) # GF-GA
|
|
101
|
+
\d{1,3} # Pts
|
|
102
|
+
\b
|
|
103
|
+
)}xi
|
|
104
|
+
|
|
105
|
+
## Pld Pts W D L GF-GA | d d d d d d-d
|
|
106
|
+
##
|
|
107
|
+
## 1. ARG^ 3 6 3 0 0 10-4
|
|
108
|
+
## 2. CHI 3 4 2 0 1 5-3
|
|
109
|
+
## 3. FRA 3 2 1 0 2 4-3
|
|
110
|
+
## 4. MEX 3 0 0 0 3 4-13
|
|
111
|
+
|
|
112
|
+
TABLE_II_RE = %r{
|
|
113
|
+
(?<table>\b
|
|
114
|
+
\d{1,2} [ ]+ # Pld
|
|
115
|
+
\d{1,3} [ ]+ # Pts
|
|
116
|
+
\d{1,2} [ ]+ # W
|
|
117
|
+
\d{1,2} [ ]+ # D
|
|
118
|
+
\d{1,2} [ ]+ # L
|
|
119
|
+
(?: \d{1,3} - [ ]* \d{1,3}) # GF-GA
|
|
120
|
+
\b
|
|
121
|
+
)}xi
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
#############################################
|
|
126
|
+
# map tables
|
|
127
|
+
# note: order matters; first come-first matched/served
|
|
128
|
+
|
|
129
|
+
## possible start lines for a table
|
|
130
|
+
## excludes NOTE
|
|
131
|
+
## and DIVIDER / ruler (e.g. --- or - - -) or such in the future
|
|
132
|
+
TABLE_RE = Regexp.union(
|
|
133
|
+
TABLE_HEADING_I_RE,
|
|
134
|
+
TABLE_I_RE,
|
|
135
|
+
TABLE_II_RE,
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
## all possible continuation for a table
|
|
139
|
+
## excludes HEADING
|
|
140
|
+
TABLE_MORE_RE = Regexp.union(
|
|
141
|
+
TABLE_NOTE_RE,
|
|
142
|
+
TABLE_DIVIDER_RE,
|
|
143
|
+
TABLE_I_RE,
|
|
144
|
+
TABLE_II_RE,
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
end # class Lexer
|
|
149
|
+
end # module SportDb
|
|
@@ -2,6 +2,11 @@ module SportDb
|
|
|
2
2
|
class Lexer
|
|
3
3
|
|
|
4
4
|
|
|
5
|
+
##
|
|
6
|
+
## todo/fix - change TEXT_RE to TEAM_RE !!!!
|
|
7
|
+
## do NOT use (anymore) as generic TEXT_RE
|
|
8
|
+
|
|
9
|
+
|
|
5
10
|
|
|
6
11
|
## todo - use ANY_RE to token_commons or such - for shared by many?
|
|
7
12
|
|
|
@@ -13,6 +18,14 @@ ANY_RE = %r{
|
|
|
13
18
|
|
|
14
19
|
|
|
15
20
|
|
|
21
|
+
## note - TEXT_RE used for TEAM_NAMES
|
|
22
|
+
## plus as "legacy" shortcut for (simple) group or round names e.g.
|
|
23
|
+
## Group A, Group 1, ..
|
|
24
|
+
## Matchday 1, 1. Round,
|
|
25
|
+
## note - no exception for (shortcut) group or round (MUST match team name pattern!)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
|
|
16
29
|
## note - do NOT allow single alpha text for now
|
|
17
30
|
## add later?? A - B C - D - why?
|
|
18
31
|
## opt 1) one alpha
|
|
@@ -24,11 +37,13 @@ ANY_RE = %r{
|
|
|
24
37
|
### allow special case - starting text with number e.g.
|
|
25
38
|
## number must be followed by space or dot (.)
|
|
26
39
|
# 1 FC ## allow 1-FC or 1FC - why? why not?
|
|
40
|
+
# 1FC
|
|
27
41
|
# 1. FC
|
|
28
|
-
# 1.FC
|
|
29
|
-
#
|
|
42
|
+
# 1.FC
|
|
43
|
+
# 23° Noviembre
|
|
30
44
|
# 1890 Munich
|
|
31
|
-
#
|
|
45
|
+
# 1-FC - XXXX - not allowed for now, parse error
|
|
46
|
+
# 1/FC - XXXX - not allowed for now
|
|
32
47
|
|
|
33
48
|
|
|
34
49
|
##
|
|
@@ -61,28 +76,37 @@ TEXT_RE = %r{
|
|
|
61
76
|
## MUST be followed by (optional dot) and
|
|
62
77
|
## required space !!!
|
|
63
78
|
## MUST be followed by a to z!!!!
|
|
64
|
-
|
|
79
|
+
[.°]? ## optional dot (.) or degree(°) - todo - add number sign too!!
|
|
65
80
|
[ ]? ## make space optional too - why? why not?
|
|
66
81
|
## yes - eg. 1st, 2nd, 5th etc.
|
|
67
82
|
\p{L}+
|
|
68
83
|
|
|
|
69
|
-
## opt 3 - add weirdo case
|
|
70
|
-
## e.g. 1/8 Finals 1/4 1/2 ...
|
|
71
|
-
1/ \d{1,2} [ ] \p{L}+
|
|
72
|
-
|
|
|
73
|
-
## opt 4 - add another weirdo case
|
|
84
|
+
## opt 3 - add another weirdo case
|
|
74
85
|
## e.g. 's Gravenwezel-Schilde
|
|
75
|
-
'[s]
|
|
76
|
-
|
|
|
77
|
-
## opt 5 - add another weirdo case
|
|
78
|
-
## e.g. 5.-8. Platz Playoffs - keep - why? why not?
|
|
79
|
-
\d+\.-\d+\. [ ]? \p{L}+
|
|
86
|
+
'[s] [ ] \p{L}+
|
|
80
87
|
)
|
|
81
88
|
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
89
|
+
|
|
90
|
+
(?:(?: (?:[ ] # only single spaces allowed inline!!!
|
|
91
|
+
## note - exclude (v[ ]/vs[ ]/vs.[ ])
|
|
92
|
+
## AND switch to case-sensitive (via -i!!!)
|
|
93
|
+
(?! (?-i: (?: ## note - (big) V not matching for versus!!!
|
|
94
|
+
vs\.?|v|VS|
|
|
95
|
+
|
|
96
|
+
n/p|N/P|
|
|
97
|
+
w/o|W/O|
|
|
98
|
+
abd\.?|ABD|
|
|
99
|
+
aban\.?|ABAN|
|
|
100
|
+
susp\.?|SUSP|
|
|
101
|
+
ppd\.?|PPD|
|
|
102
|
+
pst\.?|PST|
|
|
103
|
+
po?stp\.?|PO?STP|P-P|
|
|
104
|
+
x-x|X-X|
|
|
105
|
+
awd\.?|AWD|
|
|
106
|
+
canc\.?|CANC ) [ ]
|
|
107
|
+
|
|
|
108
|
+
(?: bye|BYE ) (?:[ ]|$))
|
|
109
|
+
)
|
|
86
110
|
)
|
|
87
111
|
|
|
|
88
112
|
[/-] ## must NOT be surrounded by spaces
|
|
@@ -90,7 +114,13 @@ TEXT_RE = %r{
|
|
|
90
114
|
(?:
|
|
91
115
|
\p{L}
|
|
92
116
|
|
|
|
93
|
-
[.&'
|
|
117
|
+
(?: ## note - restrict [.&'] to single char usage (no doubled e.g. && etc.)
|
|
118
|
+
\. (?! \.) ## allow single points only (not two or more etc.)
|
|
119
|
+
|
|
|
120
|
+
& (?! &)
|
|
121
|
+
|
|
|
122
|
+
' (?! ')
|
|
123
|
+
)
|
|
94
124
|
|
|
|
95
125
|
(?:
|
|
96
126
|
\d+
|
|
@@ -98,8 +128,9 @@ TEXT_RE = %r{
|
|
|
98
128
|
[0-9h'+] | ## protected break on 12h / 12' / 1-1
|
|
99
129
|
## check usage for 3+4 - possible? where ? why?
|
|
100
130
|
(?:[.:-]\d) ## protected/exclude/break on 12.03 / 12:03 / 12-12
|
|
101
|
-
|
|
131
|
+
## BUT allow Park21-Arena for example e.g. 21-A :-)
|
|
102
132
|
)
|
|
133
|
+
[°]? ## followed by optional ord
|
|
103
134
|
## negative lookahead for numbers
|
|
104
135
|
## note - include digits itself!!!
|
|
105
136
|
## note - remove / (slash) e.g. allows UDI'19/Beter Bed
|
|
@@ -140,11 +171,29 @@ TEXT_RE = %r{
|
|
|
140
171
|
\)
|
|
141
172
|
)?
|
|
142
173
|
(?:
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
174
|
+
######
|
|
175
|
+
# check for country code (cc)
|
|
176
|
+
# e.g. (AUT) or ,AUT or AUT
|
|
177
|
+
(?:
|
|
178
|
+
[ ] ## note - do NOT allow more than one space!!! - why? why not?
|
|
179
|
+
\(
|
|
180
|
+
## note - auto-exclude reserved (aet) from SCORE_FULLER_MORE!!!
|
|
181
|
+
## plus golden goal (gg)/sudden death (sd), silver goal (sg)
|
|
182
|
+
## (ht), (ft)
|
|
183
|
+
(?! (?: aet | agget | asdet | asget | ht | ft )
|
|
184
|
+
\)
|
|
185
|
+
)
|
|
186
|
+
(?:
|
|
187
|
+
[A-Z]{1,5}
|
|
146
188
|
)
|
|
147
189
|
\)
|
|
190
|
+
)
|
|
191
|
+
|
|
|
192
|
+
(?:
|
|
193
|
+
[ ]*[,›>][ ]*
|
|
194
|
+
[A-Z]{1,5}
|
|
195
|
+
\b
|
|
196
|
+
)
|
|
148
197
|
)?
|
|
149
198
|
## add lookahead/lookbehind
|
|
150
199
|
## must be space!!!
|