sportdb-parser 0.6.20 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Manifest.txt +14 -8
- data/Rakefile +1 -1
- data/lib/sportdb/parser/blocktxt.rb +99 -0
- data/lib/sportdb/parser/lexer.rb +958 -395
- data/lib/sportdb/parser/lexer_buffer.rb +97 -0
- data/lib/sportdb/parser/lexer_tty.rb +111 -0
- data/lib/sportdb/parser/parser.rb +1768 -855
- data/lib/sportdb/parser/racc_parser.rb +1 -1
- data/lib/sportdb/parser/racc_tree.rb +327 -41
- data/lib/sportdb/parser/token-date.rb +160 -178
- data/lib/sportdb/parser/token-date_duration.rb +190 -0
- data/lib/sportdb/parser/token-geo.rb +59 -59
- data/lib/sportdb/parser/token-goals.rb +460 -0
- data/lib/sportdb/parser/token-group.rb +43 -0
- data/lib/sportdb/parser/token-note.rb +40 -0
- data/lib/sportdb/parser/token-prop.rb +70 -54
- data/lib/sportdb/parser/token-prop_name.rb +74 -0
- data/lib/sportdb/parser/token-round.rb +102 -0
- data/lib/sportdb/parser/token-score.rb +323 -47
- data/lib/sportdb/parser/token-score_fuller.rb +435 -0
- data/lib/sportdb/parser/token-score_legs.rb +59 -0
- data/lib/sportdb/parser/token-status.rb +157 -160
- data/lib/sportdb/parser/token-table.rb +149 -0
- data/lib/sportdb/parser/token-text.rb +72 -23
- data/lib/sportdb/parser/token-time.rb +141 -0
- data/lib/sportdb/parser/token.rb +242 -105
- data/lib/sportdb/parser/token_helpers.rb +92 -0
- data/lib/sportdb/parser/version.rb +2 -2
- data/lib/sportdb/parser.rb +24 -2
- metadata +18 -18
- data/config/rounds_de.txt +0 -125
- data/config/rounds_en.txt +0 -29
- data/config/rounds_es.txt +0 -26
- data/config/rounds_misc.txt +0 -25
- data/config/rounds_pt.txt +0 -4
- data/config/zones_en.txt +0 -20
- data/lib/sportdb/parser/lang.rb +0 -298
- data/lib/sportdb/parser/token-minute.rb +0 -205
data/lib/sportdb/parser/lexer.rb
CHANGED
|
@@ -25,194 +25,386 @@ end
|
|
|
25
25
|
## for now for compatibility
|
|
26
26
|
## compat shim - hands the group check straight through to Lang
def is_group?( text )
  Lang.is_group?( text )
end
|
|
27
27
|
## compat shim - hands the round check straight through to Lang
def is_round?( text )
  Lang.is_round?( text )
end
|
|
28
|
-
|
|
29
|
-
def is_zone?( text ) Lang.is_zone?( text ); end
|
|
28
|
+
|
|
30
29
|
|
|
31
|
-
## transforms
|
|
32
|
-
##
|
|
33
|
-
## Netherlands 1-2 (1-1) England
|
|
34
|
-
## => text => team
|
|
35
|
-
## score|vs
|
|
36
|
-
## text => team
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
## token iter/find better name
|
|
40
|
-
## e.g. TokenBuffer/Scanner or such ??
|
|
41
|
-
class Tokens
|
|
42
|
-
def initialize( tokens )
|
|
43
|
-
@tokens = tokens
|
|
44
|
-
@pos = 0
|
|
45
|
-
end
|
|
46
30
|
|
|
47
|
-
def pos() @pos; end
|
|
48
|
-
def eos?() @pos >= @tokens.size; end
|
|
49
31
|
|
|
32
|
+
## true only if @debug is exactly true (nil, false or any other value gives false)
def debug?
  @debug == true
end
|
|
50
33
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
end
|
|
58
|
-
false
|
|
59
|
-
end
|
|
34
|
+
def initialize( lines, debug: false )
|
|
35
|
+
raise ArgumentError, "(string) text expected for lexer; got #{lines.class.name}" unless lines.is_a?(String)
|
|
36
|
+
|
|
37
|
+
@debug = debug
|
|
38
|
+
@txt = lines
|
|
39
|
+
end
|
|
60
40
|
|
|
61
|
-
## pattern e.g. [:TEXT, [:VS,:SCORE], :TEXT]
|
|
62
|
-
def match?( *pattern )
|
|
63
|
-
## puts " starting match? #{pattern.inspect} @ #{@pos}"
|
|
64
|
-
pattern.each_with_index do |types,offset|
|
|
65
|
-
## if single symbol wrap in array
|
|
66
|
-
types = types.is_a?(Array) ? types : [types]
|
|
67
|
-
return false unless types.include?( peek(offset) )
|
|
68
|
-
end
|
|
69
|
-
true
|
|
70
|
-
end
|
|
71
41
|
|
|
42
|
+
HTML_COMMENT_RE = %r{ <!--
|
|
43
|
+
.*? ## note - use non-greedy/lazy *? match
|
|
44
|
+
-->
|
|
45
|
+
}xm ## note - turn on multi-line match (for dot (.))
|
|
72
46
|
|
|
73
|
-
## return token type (e.g. :TEXT, :NUM, etc.)
|
|
74
|
-
def cur() peek(0); end
|
|
75
|
-
## return content (assumed to be text)
|
|
76
|
-
def text(offset=0)
|
|
77
|
-
## raise error - why? why not?
|
|
78
|
-
## return nil?
|
|
79
|
-
if peek( offset ) != :text
|
|
80
|
-
raise ArgumentError, "text(#{offset}) - token not a text type"
|
|
81
|
-
end
|
|
82
|
-
@tokens[@pos+offset][1]
|
|
83
|
-
end
|
|
84
47
|
|
|
48
|
+
##
|
|
49
|
+
## note - [] block may NOT incl. square brackets
|
|
50
|
+
## what about comments (e.g. #)?
|
|
51
|
+
## todo/check - rename to NOTE_BLOCK or TEXT_BLOCK or ???
|
|
52
|
+
PREPROC_BLOCK_RE = %r{ \[
|
|
53
|
+
[^\[\]\#]*? ## note - use non-greedy/lazy *? match
|
|
54
|
+
\]
|
|
55
|
+
}xm ## note - turn on multi-line match (for dot(.))
|
|
85
56
|
|
|
86
|
-
def peek(offset=1)
|
|
87
|
-
## return nil if eos
|
|
88
|
-
if @pos+offset >= @tokens.size
|
|
89
|
-
nil
|
|
90
|
-
else
|
|
91
|
-
@tokens[@pos+offset][0]
|
|
92
|
-
end
|
|
93
|
-
end
|
|
94
57
|
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
58
|
+
##
|
|
59
|
+
## check for "literal" (multi-line) note blocks
|
|
60
|
+
## eg. nb: or note:
|
|
61
|
+
## space required after colon - why? why not?
|
|
62
|
+
PREPROC_NOTA_BENE_RE = %r{
|
|
63
|
+
^
|
|
64
|
+
[ ]* (?: nb | note) [ ]* : [ ]+
|
|
65
|
+
.+? ## non-greedy
|
|
66
|
+
|
|
67
|
+
## positive lookahead
|
|
68
|
+
## note - must end with blank line or end-of-file/document
|
|
69
|
+
## note - do NOT eat-up trailing hrule (---)
|
|
70
|
+
(?= (?: \n [ ]* -{3,} [ ]*)?
|
|
71
|
+
\n[ ]*\n
|
|
72
|
+
| \z
|
|
73
|
+
)
|
|
74
|
+
}xim
|
|
101
75
|
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
76
|
+
##
|
|
77
|
+
## replace "escaped" newline with non-newline char e.g. '↵'
|
|
78
|
+
LINE_CONTINUATION_RE = %r{
|
|
79
|
+
\\[ ]* \n
|
|
80
|
+
}x
|
|
106
81
|
|
|
107
|
-
def collect( &blk )
|
|
108
|
-
tokens = []
|
|
109
|
-
loop do
|
|
110
|
-
break if eos?
|
|
111
|
-
tokens << if block_given?
|
|
112
|
-
blk.call( self.next )
|
|
113
|
-
else
|
|
114
|
-
self.next
|
|
115
|
-
end
|
|
116
|
-
end
|
|
117
|
-
tokens
|
|
118
|
-
end
|
|
119
|
-
end # class Tokens
|
|
120
82
|
|
|
121
83
|
|
|
84
|
+
###
|
|
85
|
+
## check for magic comments
|
|
86
|
+
## e.g # teletype: true or TELETYPE: TRUE
|
|
87
|
+
## tty/teletype
|
|
88
|
+
|
|
89
|
+
MAGIC_COMMENT_RE = %r{ \A
|
|
90
|
+
[ ]* ## optional leading spaces
|
|
91
|
+
\#+ ## note - allow ##,###, etc. too
|
|
92
|
+
[ ]* ## optional spaces
|
|
93
|
+
(?<magic_comment_key> tty | teletype )
|
|
94
|
+
[ ]* ## optional spaces
|
|
95
|
+
:
|
|
96
|
+
[ ]* ## optional spaces
|
|
97
|
+
(?<magic_comment_value> true | false )
|
|
98
|
+
[ ]* ## optional trailing spaces
|
|
99
|
+
\z
|
|
100
|
+
}ix
|
|
101
|
+
|
|
122
102
|
|
|
123
103
|
|
|
124
|
-
def debug?() @debug == true; end
|
|
125
104
|
|
|
126
|
-
def initialize( lines, debug: false )
|
|
127
|
-
@debug = debug
|
|
128
105
|
|
|
129
|
-
## note - for convenience - add support
|
|
130
|
-
## comments (incl. inline end-of-line comments) and empty lines here
|
|
131
|
-
## why? why not?
|
|
132
|
-
## why? keeps handling "centralized" here in one place
|
|
133
106
|
|
|
134
|
-
|
|
135
|
-
|
|
107
|
+
def tokenize_with_errors
|
|
108
|
+
|
|
109
|
+
####
|
|
110
|
+
## flags / modes
|
|
111
|
+
@teletype = false # use magic comment - tty/teletype: true
|
|
136
112
|
|
|
137
|
-
txt_pre = if lines.is_a?( Array )
|
|
138
|
-
## join together with newline
|
|
139
|
-
lines.reduce( String.new ) do |mem,line|
|
|
140
|
-
mem << line; mem << "\n"; mem
|
|
141
|
-
end
|
|
142
|
-
else ## assume single-all-in-one txt
|
|
143
|
-
lines
|
|
144
|
-
end
|
|
145
113
|
|
|
146
|
-
|
|
114
|
+
|
|
115
|
+
tokens_by_line = [] ## note: add tokens line-by-line (flatten later)
|
|
116
|
+
errors = [] ## keep a list of errors - why? why not?
|
|
117
|
+
|
|
118
|
+
## preprocess automagically - why? why not?
|
|
147
119
|
## strip lines with comments and empty lines stripped / removed
|
|
148
120
|
## keep empty lines? why? why not?
|
|
149
121
|
## keep leading spaces (indent) - why?
|
|
150
122
|
##
|
|
151
123
|
## note - KEEP empty lines (get turned into BLANK token!!!!)
|
|
152
124
|
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
125
|
+
|
|
126
|
+
## "universal" newlines
|
|
127
|
+
## replace all windows-style cr+lf (\r\n) to lf (\n) only
|
|
128
|
+
txt = @txt.gsub( "\r\n", "\n" )
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
###
|
|
133
|
+
## quick hack for now
|
|
134
|
+
## remove html-style comments <!-- -->
|
|
135
|
+
## (incl. multi-line) with two spaces
|
|
136
|
+
## will mess-up lineno tracking!!!
|
|
137
|
+
## fix later to have function lineno & colno!!!
|
|
138
|
+
txt = @txt.gsub( HTML_COMMENT_RE ) do |m|
|
|
139
|
+
puts " [debug] preproc html comment:"
|
|
140
|
+
puts m
|
|
141
|
+
' '
|
|
142
|
+
end
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
=begin
|
|
146
|
+
##
|
|
147
|
+
## todo/fix - add a command line switch/option for auto-format fixes !!!
|
|
148
|
+
## quick hack - remove later
|
|
149
|
+
## auto-convert "old" legacy round markers (»)
|
|
150
|
+
txt = txt.gsub( %r{^ [ ]*
|
|
151
|
+
»
|
|
152
|
+
(?= [ ]+) ## require one trailing space for now!!
|
|
153
|
+
}ix ) do |_|
|
|
154
|
+
puts "!! WARN - auto-fix format; replacing old (alternate/legacy) round marker (»)"
|
|
155
|
+
'▪'
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
### 16.00 => 16:00
|
|
160
|
+
## todo/check - use space for positive lookbehind & ahead
|
|
161
|
+
## (instead of \b) - why? why not?
|
|
162
|
+
## note - check for/exclude 12.12. date in match
|
|
163
|
+
## use negative lookahead
|
|
164
|
+
## check for 12.12.94
|
|
165
|
+
## use positive lookbehind !!!
|
|
166
|
+
## must be space, comma or begin-of-line [ ,]|^
|
|
167
|
+
## or use negative lookbehind
|
|
168
|
+
## must NOT be dot
|
|
169
|
+
txt = txt.gsub( %r{
|
|
170
|
+
## check NEGATIVE lookbehind
|
|
171
|
+
(?<! [.]) ## do NOT match 12.94 in 12.12.94
|
|
172
|
+
\b
|
|
173
|
+
(?<h>\d{1,2})
|
|
174
|
+
\.
|
|
175
|
+
(?<m>\d{2})
|
|
176
|
+
\b
|
|
177
|
+
(?! [.] ) ## do NOT match 12.12.
|
|
178
|
+
}ix ) do |_|
|
|
179
|
+
m = $~ ## is $LAST_MATCH_DATA
|
|
180
|
+
puts "!! WARN - auto-fix format; replacing old (alternate/legacy) time format #{m[0]}"
|
|
181
|
+
"#{m[:h]}:#{m[:m]}" ## '\1:\2'
|
|
182
|
+
end
|
|
183
|
+
=end
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
###
|
|
189
|
+
## add more "native" multi-line comment-styles
|
|
190
|
+
## e.g. #[[ ... ]] or #<<< .. >>> or #<< .. >>
|
|
191
|
+
## or such - why? why not?
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
txt = txt.gsub( PREPROC_NOTA_BENE_RE ) do |m|
|
|
195
|
+
if m.include?( "\n" ) ## check for newlines (\n) and replace
|
|
196
|
+
puts " [debug] preproc (multi-line) note/nota bene block:"
|
|
197
|
+
puts m
|
|
198
|
+
## todo/check: replace with two spaces instead of ↵ - why? why not?
|
|
199
|
+
m.gsub( "\n", '↵' )
|
|
200
|
+
else
|
|
201
|
+
m
|
|
202
|
+
end
|
|
162
203
|
end
|
|
163
|
-
end
|
|
164
204
|
|
|
165
205
|
|
|
206
|
+
##
|
|
207
|
+
## e.g. used in (multi-line) TableNote
|
|
208
|
+
## 1.SOUTH KOREA 6 5 1 0 22- 1 16 [0-0]
|
|
209
|
+
## 2.LEBANON 6 3 1 2 11- 8 10 [0-2, 0-0]
|
|
210
|
+
## 3.Turkmenistan 6 3 0 3 8-11 9 [3-1]
|
|
211
|
+
## 4.Sri Lanka 6 0 0 6 2-23 0 [0-1]
|
|
212
|
+
## -.North Korea [withdrew after playing 5 matches due to safety concerns in
|
|
213
|
+
## connection with the Covid-19 pandemic; all results annulled]
|
|
214
|
+
##
|
|
215
|
+
## note - no longer used for now
|
|
216
|
+
## enclose multi-line notes in []
|
|
217
|
+
## removes need for line continuation for now
|
|
166
218
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
219
|
+
##
|
|
220
|
+
## txt = txt.gsub( LINE_CONTINUATION_RE ) do |_|
|
|
221
|
+
## puts " [debug] preproc line continuation"
|
|
222
|
+
## ## todo/check: replace with two spaces insead of ↵ - why? why not?
|
|
223
|
+
## '↵'
|
|
224
|
+
## end
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
#####
|
|
229
|
+
## (another) quick hack for now
|
|
230
|
+
## turn multi-line note blocks into
|
|
231
|
+
## single-line note blocks
|
|
232
|
+
## by changing newline (\n) to ⏎ (unicode U+23CE)
|
|
233
|
+
## or why not to ___ ?
|
|
234
|
+
##
|
|
235
|
+
## unicode options for return/arrows:
|
|
236
|
+
## - ↵ (U+21B5): Downwards Arrow With Corner Leftwards.
|
|
237
|
+
## This is the most common "carriage return" symbol.
|
|
238
|
+
## - ⏎ (U+23CE): Return Symbol.
|
|
239
|
+
## Specifically designated as the keyboard's "Return" key symbol,
|
|
240
|
+
## often used in user interfaces.
|
|
241
|
+
|
|
242
|
+
txt = txt.gsub( PREPROC_BLOCK_RE ) do |m|
|
|
243
|
+
if m.include?( "\n" ) ## check for newlines (\n) and replace
|
|
244
|
+
puts " [debug] preproc (multi-line) block:"
|
|
245
|
+
puts m
|
|
246
|
+
## todo/check: replace with two spaces instead of ↵ - why? why not?
|
|
247
|
+
m.gsub( "\n", '↵' )
|
|
248
|
+
else
|
|
249
|
+
m
|
|
250
|
+
end
|
|
251
|
+
end
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
####
|
|
255
|
+
## quick hack - keep re state/mode between tokenize calls!!!
|
|
256
|
+
@re ||= RE ## note - switch between RE & INSIDE_RE
|
|
170
257
|
|
|
171
|
-
@txt.each_line do |line|
|
|
172
|
-
line = line.rstrip ## note - MUST remove/strip trailing newline (spaces optional)!!!
|
|
173
|
-
|
|
174
|
-
more_tokens, more_errors = _tokenize_line( line )
|
|
175
|
-
|
|
176
|
-
tokens_by_line << more_tokens
|
|
177
|
-
errors += more_errors
|
|
178
|
-
end # each line
|
|
179
258
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
##
|
|
183
|
-
|
|
184
|
-
|
|
259
|
+
txt.each_line do |line|
|
|
260
|
+
## line = line.rstrip ## note - MUST remove/strip trailing newline (spaces optional)!!!
|
|
261
|
+
line = line.strip ## note - strip leading AND trailing whitespaces
|
|
262
|
+
## note - trailing whitespace may incl. \n or \r\n!!!
|
|
263
|
+
|
|
264
|
+
|
|
185
265
|
##
|
|
186
|
-
|
|
187
|
-
##
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
266
|
+
###
|
|
267
|
+
## check for magic comments
|
|
268
|
+
## e.g # teletype: true or TELETYPE: TRUE
|
|
269
|
+
## tty/teletype
|
|
270
|
+
|
|
271
|
+
if line.start_with?('#') ### skip comments (& check magic comments!!)
|
|
272
|
+
|
|
273
|
+
if (m = MAGIC_COMMENT_RE.match(line))
|
|
274
|
+
magic_comment_key = m[:magic_comment_key].downcase
|
|
275
|
+
magic_comment_value = m[:magic_comment_value].downcase
|
|
276
|
+
|
|
277
|
+
## turn on teletype mode
|
|
278
|
+
## e.g. tty: true or teletype: true
|
|
279
|
+
if ['tty', 'teletype'].include?( magic_comment_key ) &&
|
|
280
|
+
['true'].include?( magic_comment_value )
|
|
281
|
+
puts " magic comment - turn on teletype (tty) mode"
|
|
282
|
+
@teletype = true
|
|
283
|
+
end
|
|
284
|
+
end
|
|
285
|
+
|
|
286
|
+
next
|
|
287
|
+
end
|
|
288
|
+
|
|
289
|
+
line = line.sub( /#.*/, '' ).strip ### cut-off end-of line comments too
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
####
|
|
293
|
+
# support __END__ marker to cut-off input
|
|
294
|
+
break if line.strip == '__END__'
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
##
|
|
299
|
+
## first check for tabs
|
|
300
|
+
## add error/warn
|
|
301
|
+
## for auto-fix - replace tabs with two spaces
|
|
302
|
+
|
|
303
|
+
line = line.gsub( "\t" ) do |_|
|
|
304
|
+
## report error here
|
|
305
|
+
## todo/add error here
|
|
306
|
+
puts "!! WARN - auto-fix; replacing tab (\\t) with two spaces in line #{line.inspect}"
|
|
307
|
+
" " ## replace with two spaces
|
|
200
308
|
end
|
|
201
309
|
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
310
|
+
|
|
311
|
+
## U+00A0 (160) -- non-breaking space (unicode)
|
|
312
|
+
line = line.gsub( "\u00A0" ) do |uni|
|
|
313
|
+
## report error here
|
|
314
|
+
## todo/add error here
|
|
315
|
+
puts "!! WARN - auto-fix; replacing non-breaking unicode space (#{uni}/#{uni.ord}) w/ ascii space ( /#{" ".ord}) in line #{line.inspect}"
|
|
316
|
+
" " ## replace with space
|
|
317
|
+
end
|
|
318
|
+
|
|
319
|
+
###
|
|
320
|
+
## todo/fix - print unicode numbers for [–−]
|
|
321
|
+
## different candidates to differentiate and document!!!
|
|
322
|
+
## – => U+2013 (8211) -- En Dash (unicode)
|
|
323
|
+
## − => U+2212 (8722) -- Minus Sign (unicode)
|
|
324
|
+
line = line.gsub( /[–−]/ ) do |uni|
|
|
325
|
+
## report error here
|
|
326
|
+
## todo/add error here
|
|
327
|
+
puts "!! WARN - auto-fix; replacing unicode dash (#{uni}/#{uni.ord}) w/ ascii dash (-/#{"-".ord}) in line #{line.inspect}"
|
|
328
|
+
'-' ## replace with ascii dash (-)
|
|
329
|
+
end
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
puts "line: >#{line}<" if debug?
|
|
334
|
+
|
|
335
|
+
######
|
|
336
|
+
### special case for empty line (aka BLANK)
|
|
337
|
+
if line.empty?
|
|
338
|
+
## note - blank always resets parser mode to std/top-level!!!
|
|
339
|
+
@re = RE
|
|
340
|
+
tokens_by_line << [[:BLANK, '<|BLANK|>']]
|
|
341
|
+
elsif (m = HEADING_RE.match(line))
|
|
342
|
+
## note - heading always resets parser mode to std/top-level!!!
|
|
343
|
+
@re = RE
|
|
344
|
+
puts " HEADING" if debug?
|
|
345
|
+
## note - derive heading level from no of (leading) markers
|
|
346
|
+
## e.g. = is 1, == is 2, === is 3, etc.
|
|
347
|
+
heading_level = m[:heading_marker].size
|
|
348
|
+
tokens_by_line << [[:"H#{heading_level}", m[:heading]]]
|
|
349
|
+
elsif (m = NOTA_BENE_RE.match(line))
|
|
350
|
+
## note - nota bene always resets parser mode to std/top-level!!!
|
|
351
|
+
@re = RE
|
|
352
|
+
tokens_by_line << [[:NOTA_BENE, m[:nota_bene]]]
|
|
353
|
+
elsif @re == RE && (m = TABLE_RE.match(line))
|
|
354
|
+
@re = TABLE_MORE_RE ## switch into table mode
|
|
355
|
+
if m[:table_heading]
|
|
356
|
+
tokens_by_line << [[:TABLE_HEADING, m[:table_heading]]]
|
|
357
|
+
else ## assume table (line) e.g. m[:table]
|
|
358
|
+
tokens_by_line << [[:TABLE_LINE, line]]
|
|
359
|
+
end
|
|
360
|
+
elsif @re == TABLE_MORE_RE
|
|
361
|
+
### todo/fix - check if no match and report/add error!!
|
|
362
|
+
## for now (unmatched) line gets auto-added as table line!!!
|
|
363
|
+
##
|
|
364
|
+
## note - MUST be followed by blank line (or nota bene/heading)
|
|
365
|
+
## to switch back to top-level!!!!
|
|
366
|
+
m = TABLE_MORE_RE.match(line)
|
|
367
|
+
if m[:table_note]
|
|
368
|
+
tokens_by_line << [[:TABLE_NOTE, m[:table_note]]]
|
|
369
|
+
elsif m[:table_divider]
|
|
370
|
+
tokens_by_line << [[:TABLE_DIVIDER, m[:table_divider]]]
|
|
371
|
+
else ## assume table (line) e.g. m[:table]
|
|
372
|
+
tokens_by_line << [[:TABLE_LINE, line]]
|
|
211
373
|
end
|
|
374
|
+
elsif @re != TABLE_MORE_RE && (m = HRULER_RE.match(line))
|
|
375
|
+
## note - hruler (---)
|
|
376
|
+
## will only match if NOT in table mode!!!
|
|
377
|
+
## otherwise
|
|
378
|
+
## hruler always resets parser mode to std/top-level!!!
|
|
379
|
+
@re = RE
|
|
380
|
+
tokens_by_line << [[:HRULER, '<|HRULER|>']]
|
|
381
|
+
elsif @teletype && (@re == RE && IS_TTY_LINE_RE.match(line))
|
|
382
|
+
## try experimental TELETYPE (TTY) mode!!!
|
|
383
|
+
## note - turn on via magic comment e.g. tty/teletype: true
|
|
384
|
+
###
|
|
385
|
+
### move inside _tokenize_line - why? why not?
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
tokens_by_line << _tokenize_tty_line( line )
|
|
389
|
+
|
|
390
|
+
## note - dates such as
|
|
391
|
+
## APR 11 or 11 APR will trigger TELETYPE
|
|
392
|
+
### ## check letter
|
|
393
|
+
else
|
|
394
|
+
|
|
395
|
+
more_tokens, more_errors = _tokenize_line( line )
|
|
396
|
+
|
|
397
|
+
tokens_by_line << more_tokens
|
|
398
|
+
errors += more_errors
|
|
212
399
|
end
|
|
400
|
+
end # each line
|
|
401
|
+
|
|
402
|
+
|
|
213
403
|
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
tokens_by_line = tokens_by_line.map do |tokens|
|
|
214
407
|
#################
|
|
215
|
-
## pass 2
|
|
216
408
|
## transform tokens (using simple patterns)
|
|
217
409
|
## to help along the (racc look ahead 1 - LA1) parser
|
|
218
410
|
nodes = []
|
|
@@ -220,48 +412,72 @@ def tokenize_with_errors
|
|
|
220
412
|
buf = Tokens.new( tokens )
|
|
221
413
|
## pp buf
|
|
222
414
|
|
|
223
|
-
|
|
224
415
|
loop do
|
|
225
416
|
break if buf.eos?
|
|
226
417
|
|
|
227
|
-
if buf.
|
|
228
|
-
## check for
|
|
229
|
-
## group def or round def
|
|
230
|
-
if buf.match?( :ROUND, :'|' ) ## assume round def (change round to round_def)
|
|
231
|
-
nodes << [:ROUND_DEF, buf.next[1]]
|
|
232
|
-
nodes << buf.next
|
|
233
|
-
nodes += buf.collect
|
|
234
|
-
break
|
|
235
|
-
end
|
|
236
|
-
if buf.match?( :GROUP, :'|' ) ## assume group def (change group to group_def)
|
|
237
|
-
nodes << [:GROUP_DEF, buf.next[1]]
|
|
238
|
-
nodes << buf.next
|
|
239
|
-
## change all text to team - why? why not?
|
|
240
|
-
nodes += buf.collect { |t|
|
|
241
|
-
t[0] == :TEXT ? [:TEAM, t[1]] : t
|
|
242
|
-
}
|
|
243
|
-
break
|
|
244
|
-
end
|
|
245
|
-
end
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
if buf.match?( :TEXT, [:SCORE, :SCORE_MORE, :VS, :'-'], :TEXT )
|
|
249
|
-
nodes << [:TEAM, buf.next[1]]
|
|
250
|
-
nodes << buf.next
|
|
251
|
-
nodes << [:TEAM, buf.next[1]]
|
|
252
|
-
# note - now handled (upstream) with GOAL_RE mode!!!
|
|
253
|
-
# elsif buf.match?( :TEXT, :MINUTE )
|
|
254
|
-
# nodes << [:PLAYER, buf.next[1]]
|
|
255
|
-
# nodes << buf.next
|
|
256
|
-
elsif buf.match?( :DATE, :TIME ) ## merge DATE TIME into DATETIME
|
|
418
|
+
if buf.match?( :DATE, :TIME ) ## merge DATE TIME into DATETIME
|
|
257
419
|
date = buf.next[1]
|
|
258
420
|
time = buf.next[1]
|
|
259
421
|
## puts "DATETIME:"
|
|
260
422
|
## pp date, time
|
|
423
|
+
## note: time value is { time: {} } or
|
|
424
|
+
## { time: {}, time_local: {} }
|
|
261
425
|
val = [date[0] + ' ' + time[0], ## concat string of two tokens
|
|
262
|
-
{ date: date[1]
|
|
426
|
+
{ date: date[1] }.merge( time[1] )
|
|
427
|
+
]
|
|
428
|
+
nodes << [:DATETIME, val]
|
|
429
|
+
### support date time with comma too - why? why not?
|
|
430
|
+
elsif buf.match?( :DATE, :',', :TIME )
|
|
431
|
+
date = buf.next[1]
|
|
432
|
+
_ = buf.next ## ignore comma
|
|
433
|
+
time = buf.next[1]
|
|
434
|
+
## puts "DATETIME:"
|
|
435
|
+
## pp date, time
|
|
436
|
+
val = [date[0] + ', ' + time[0], ## concat string of two tokens
|
|
437
|
+
{ date: date[1] }.merge( time[1] )
|
|
438
|
+
]
|
|
439
|
+
nodes << [:DATETIME, val]
|
|
440
|
+
elsif buf.match?( :TEAM, :SCORE_TEAM )
|
|
441
|
+
## merge TEAM SCORE_TEAM into TEAMALT
|
|
442
|
+
## (use TEAMENTRY or TEAMRESULT - why? why not?)
|
|
443
|
+
team = buf.next[1]
|
|
444
|
+
score_team = buf.next[1]
|
|
445
|
+
val = [team + ' ' + score_team[0], ## concat string of two tokens
|
|
446
|
+
{ team: team }.merge( score_team[1] )
|
|
447
|
+
]
|
|
448
|
+
nodes << [:TEAMALT, val]
|
|
449
|
+
elsif buf.match?( :TEAM, :SCORE_TEAM_PEN )
|
|
450
|
+
team = buf.next[1]
|
|
451
|
+
score_team_pen = buf.next[1]
|
|
452
|
+
val = [team + ' ' + score_team_pen[0], ## concat string of two tokens
|
|
453
|
+
{ team: team }.merge( score_team_pen[1] )
|
|
454
|
+
]
|
|
455
|
+
nodes << [:TEAMALT_PEN, val]
|
|
456
|
+
elsif buf.match?( :TEAM, :SCORE_TEAM_NUM )
|
|
457
|
+
team = buf.next[1]
|
|
458
|
+
score_team_num = buf.next[1]
|
|
459
|
+
val = [team + ' ' + score_team_num[0], ## concat string of two tokens
|
|
460
|
+
{ team: team }.merge( score_team_num[1] )
|
|
263
461
|
]
|
|
264
|
-
nodes << [:
|
|
462
|
+
nodes << [:TEAMALT_NUM, val]
|
|
463
|
+
elsif buf.match?( :GOAL_MINUTE, :',', :GOAL_MINUTE )
|
|
464
|
+
## note - only advance by two tokens!
|
|
465
|
+
## allows more :GOAL_MINUTE sequences!! e.g. 12,13,14 etc!!!
|
|
466
|
+
##
|
|
467
|
+
## help parser with comma shift/reduce conflict
|
|
468
|
+
## change ',' to GOAL_MINUTE_SEP !!!
|
|
469
|
+
nodes << buf.next ## pass through goal_minute
|
|
470
|
+
_ = buf.next ## eat-up goal_minute_sep a.k.a. comma (,)
|
|
471
|
+
## and replace with dedicated sep(arator)
|
|
472
|
+
nodes << [:GOAL_MINUTE_SEP,"<|GOAL_MINUTE_SEP|>"]
|
|
473
|
+
elsif buf.match?( :',', :INLINE_ATTENDANCE )
|
|
474
|
+
## note - allow optional comma before inline attendance
|
|
475
|
+
## help parser with comma shift/reduce conflict
|
|
476
|
+
## change ',' to INLINE_ATTENDANCE_SEP !!!
|
|
477
|
+
nodes << [:INLINE_ATTENDANCE_SEP, "<|INLINE_ATTENDANCE_SEP|>"]
|
|
478
|
+
_ = buf.next ## eat-up inline_attendance_sep a.k.a. comma (,)
|
|
479
|
+
## and replace with dedicated sep(arator)
|
|
480
|
+
nodes << buf.next ## pass through inline_attendance
|
|
265
481
|
else
|
|
266
482
|
## pass through
|
|
267
483
|
nodes << buf.next
|
|
@@ -271,6 +487,7 @@ def tokenize_with_errors
|
|
|
271
487
|
end # map tokens_by_line
|
|
272
488
|
|
|
273
489
|
|
|
490
|
+
|
|
274
491
|
|
|
275
492
|
## flatten tokens
|
|
276
493
|
tokens = []
|
|
@@ -280,9 +497,49 @@ def tokenize_with_errors
|
|
|
280
497
|
pp tok
|
|
281
498
|
end
|
|
282
499
|
|
|
500
|
+
|
|
501
|
+
###############
|
|
502
|
+
## "hacky" (automagic) line merges (remove newline)
|
|
503
|
+
## if line start with @ - check if incl. teams
|
|
504
|
+
|
|
505
|
+
###
|
|
506
|
+
### quick merge lines hack
|
|
507
|
+
## if line starts with geo-marker token @
|
|
508
|
+
## check if line incl. TEAM
|
|
509
|
+
## if yes, leave alone
|
|
510
|
+
## otherwise merge line into previous line!!
|
|
511
|
+
## - todo/fix - possibly handle in grammar!!!
|
|
512
|
+
## for now match_line CAN start with @ London
|
|
513
|
+
## resulting in parser conflict(s)!!!
|
|
514
|
+
## e.g.
|
|
515
|
+
## England v Scotland
|
|
516
|
+
## @ London
|
|
517
|
+
## =>
|
|
518
|
+
## England v Scotland @ London
|
|
519
|
+
##
|
|
520
|
+
|
|
521
|
+
##
|
|
522
|
+
## note/todo - if INDENT / SPACES get added
|
|
523
|
+
## adjust here
|
|
524
|
+
## tok[0][0] == :INDENT (or :SPACES) &&
|
|
525
|
+
## tok[1][0] == :'@'
|
|
526
|
+
|
|
527
|
+
if tok[0] && tok[0][0] == :'@'
|
|
528
|
+
team = tok.find { |t| t[0] == :TEAM }
|
|
529
|
+
if team
|
|
530
|
+
## do nothing - keep as is (assume match_line starting w/ @)
|
|
531
|
+
else
|
|
532
|
+
## no team(s) found in line
|
|
533
|
+
## remove last token (that is, NEWLINE)
|
|
534
|
+
## note - possibly is blank ?! keep blank
|
|
535
|
+
tokens.pop if tokens[-1][0] == :NEWLINE
|
|
536
|
+
end
|
|
537
|
+
end
|
|
538
|
+
|
|
539
|
+
|
|
283
540
|
tokens += tok
|
|
284
541
|
## auto-add newlines (unless BLANK!!)
|
|
285
|
-
tokens << [:NEWLINE, "\n"] unless tok[0][0] == :BLANK
|
|
542
|
+
tokens << [:NEWLINE, "\n"] unless tok[0] && tok[0][0] == :BLANK
|
|
286
543
|
end
|
|
287
544
|
|
|
288
545
|
[tokens,errors]
|
|
@@ -290,42 +547,11 @@ end # method tokenize_with_errors
|
|
|
290
547
|
|
|
291
548
|
|
|
292
549
|
|
|
293
|
-
### add a QUICK_PLAYER_WITH_MINUTE check
|
|
294
|
-
QUICK_PLAYER_WITH_MINUTE_RE = %r{
|
|
295
|
-
## note - \b NOT working for ? !!!
|
|
296
|
-
##
|
|
297
|
-
## use positive lookbehind
|
|
298
|
-
(?<= [ ,;\(\)\[\]]|^)
|
|
299
|
-
|
|
300
|
-
(?:
|
|
301
|
-
(?:
|
|
302
|
-
\d{1,3} ## constrain numbers to 0 to 999!!!
|
|
303
|
-
(?: \+\d{1,3}
|
|
304
|
-
)?
|
|
305
|
-
)
|
|
306
|
-
|
|
|
307
|
-
(?: \?{2} | _{2} ) ## add support for n/a (not/available)
|
|
308
|
-
)
|
|
309
|
-
' ## must have minute marker!!!!
|
|
310
|
-
}ix
|
|
311
|
-
|
|
312
550
|
|
|
313
551
|
def _tokenize_line( line )
|
|
314
552
|
tokens = []
|
|
315
553
|
errors = [] ## keep a list of errors - why? why not?
|
|
316
554
|
|
|
317
|
-
puts "line: >#{line}<" if debug?
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
### special case for empty line (aka BLANK)
|
|
321
|
-
if line.empty?
|
|
322
|
-
## note - blank always resets parser mode to std/top-level!!!
|
|
323
|
-
@re = RE
|
|
324
|
-
|
|
325
|
-
tokens << [:BLANK, '<|BLANK|>']
|
|
326
|
-
return [tokens, errors]
|
|
327
|
-
end
|
|
328
|
-
|
|
329
555
|
|
|
330
556
|
pos = 0
|
|
331
557
|
## track last offsets - to report error on no match
|
|
@@ -333,6 +559,9 @@ def _tokenize_line( line )
|
|
|
333
559
|
offsets = [0,0]
|
|
334
560
|
m = nil
|
|
335
561
|
|
|
562
|
+
## track number of geo text seen
|
|
563
|
+
## (use for - do NOT break on two spaces if no geo text seen yet!!)
|
|
564
|
+
geo_count = 0
|
|
336
565
|
|
|
337
566
|
####
|
|
338
567
|
## quick hack - keep re state/mode between tokenize calls!!!
|
|
@@ -342,34 +571,76 @@ def _tokenize_line( line )
|
|
|
342
571
|
if @re == RE ## top-level
|
|
343
572
|
### check for modes once (per line) here to speed-up parsing
|
|
344
573
|
### for now goals only possible for start of line!!
|
|
345
|
-
### fix - remove optional [] - why? why not?
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
##
|
|
349
|
-
|
|
574
|
+
### fix - remove optional [] - why? why not?
|
|
575
|
+
|
|
576
|
+
####
|
|
577
|
+
## note - ord e.g. (45) for match number can only start a (match) line
|
|
578
|
+
## "inline" use NOT possible
|
|
579
|
+
## note - ord (for ordinal number!!!) e.g match number (1), (42), etc.
|
|
580
|
+
if (m = START_WITH_ORD.match(line))
|
|
581
|
+
## note - strip enclosing () and convert to integer
|
|
582
|
+
tokens << [:ORD, [m[:ord], { value: m[:value].to_i(10) } ]]
|
|
583
|
+
|
|
584
|
+
offsets = [m.begin(0), m.end(0)]
|
|
585
|
+
pos = offsets[1] ## update pos
|
|
586
|
+
elsif (m = START_WITH_YEAR.match(line))
|
|
587
|
+
## note - strip enclosing () and convert to integer
|
|
588
|
+
tokens << [:YEAR, m[:year].to_i(10)]
|
|
589
|
+
|
|
590
|
+
offsets = [m.begin(0), m.end(0)]
|
|
591
|
+
pos = offsets[1] ## update pos
|
|
592
|
+
|
|
593
|
+
###
|
|
594
|
+
## todo/fix - rename to START_GROUP_DEF_LINE_RE !!!!
|
|
595
|
+
elsif (m = GROUP_DEF_LINE_RE.match( line ))
|
|
596
|
+
puts " ENTER GROUP_DEF_RE MODE" if debug?
|
|
597
|
+
@re = GROUP_DEF_RE
|
|
598
|
+
|
|
599
|
+
tokens << [:GROUP_DEF, m[:group_def]]
|
|
600
|
+
|
|
601
|
+
offsets = [m.begin(0), m.end(0)]
|
|
602
|
+
pos = offsets[1] ## update pos
|
|
603
|
+
|
|
604
|
+
### todo/fix - rename to PROP_KEY_RE to START_WITH_PROP_KEY_RE !!!
|
|
605
|
+
elsif (m = PROP_KEY_RE.match( line ))
|
|
606
|
+
## start with prop key (match will switch into prop mode!!!)
|
|
607
|
+
## - fix - remove leading spaces in regex (upstream) - why? why not?
|
|
608
|
+
##
|
|
350
609
|
### switch into new mode
|
|
351
610
|
## switch context to PROP_RE
|
|
352
611
|
puts " ENTER PROP_RE MODE" if debug?
|
|
353
612
|
key = m[:key]
|
|
354
613
|
|
|
355
614
|
|
|
356
|
-
### todo - add prop yellow/red cards too - why? why not?
|
|
357
|
-
|
|
615
|
+
### todo/fix - add prop yellow/red cards too - why? why not?
|
|
616
|
+
## todo/fix - separate sent off and red card
|
|
617
|
+
## sent-off - incl. red card, yellow/red card and the era before red cards!!
|
|
618
|
+
if ['sent off'].include?( key.downcase)
|
|
619
|
+
@re = PROP_CARDS_RE ## use CARDS_RE ???
|
|
620
|
+
tokens << [:PROP_SENTOFF, m[:key]]
|
|
621
|
+
elsif ['red cards'].include?( key.downcase )
|
|
358
622
|
@re = PROP_CARDS_RE ## use CARDS_RE ???
|
|
359
623
|
tokens << [:PROP_REDCARDS, m[:key]]
|
|
360
624
|
elsif ['yellow cards'].include?( key.downcase )
|
|
361
625
|
@re = PROP_CARDS_RE
|
|
362
626
|
tokens << [:PROP_YELLOWCARDS, m[:key]]
|
|
363
|
-
elsif ['ref', 'referee'
|
|
627
|
+
elsif ['ref', 'referee',
|
|
628
|
+
'refs', 'referees' ## note - allow/support assistant refs
|
|
629
|
+
].include?( key.downcase )
|
|
364
630
|
@re = PROP_REFEREE_RE
|
|
365
631
|
tokens << [:PROP_REFEREE, m[:key]]
|
|
366
632
|
elsif ['att', 'attn', 'attendance'].include?( key.downcase )
|
|
367
633
|
@re = PROP_ATTENDANCE_RE
|
|
368
634
|
tokens << [:PROP_ATTENDANCE, m[:key]]
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
635
|
+
|
|
636
|
+
# elsif ['goals'].include?( key.downcase )
|
|
637
|
+
# @re = PROP_GOAL_RE
|
|
638
|
+
# tokens << [:PROP_GOALS, m[:key]]
|
|
639
|
+
|
|
640
|
+
elsif ['penalties',
|
|
641
|
+
'penalty shootout',
|
|
642
|
+
'penalty shoot-out',
|
|
643
|
+
'penalty kicks'].include?( key.downcase )
|
|
373
644
|
@re = PROP_PENALTIES_RE
|
|
374
645
|
tokens << [:PROP_PENALTIES, m[:key]]
|
|
375
646
|
else ## assume (team) line-up
|
|
@@ -379,63 +650,69 @@ def _tokenize_line( line )
|
|
|
379
650
|
|
|
380
651
|
offsets = [m.begin(0), m.end(0)]
|
|
381
652
|
pos = offsets[1] ## update pos
|
|
653
|
+
###
|
|
654
|
+
### todo/fix
|
|
655
|
+
### rename to START_WITH_ROUND_DEF_OUTLINE_RE !!!!
|
|
656
|
+
elsif (m = ROUND_DEF_OUTLINE_RE.match( line ))
|
|
657
|
+
puts " ENTER ROUND_DEF_RE MODE" if debug?
|
|
658
|
+
@re = ROUND_DEF_RE
|
|
659
|
+
|
|
660
|
+
## note - return ROUND_DEF NOT ROUND_OUTLINE token
|
|
661
|
+
tokens << [:ROUND_DEF, m[:round_outline]]
|
|
662
|
+
|
|
663
|
+
offsets = [m.begin(0), m.end(0)]
|
|
664
|
+
pos = offsets[1] ## update pos
|
|
382
665
|
elsif (m = ROUND_OUTLINE_RE.match( line ))
|
|
383
666
|
puts " ROUND_OUTLINE" if debug?
|
|
667
|
+
## note - derive round level from no of (leading) markers
|
|
668
|
+
## e.g. ▪/:: is 1, ▪▪/::: is 2, ▪▪▪/:::: is 3, etc.
|
|
669
|
+
## note - ascii-style starts with double ::, thus, autodecrement by one!
|
|
670
|
+
round_level = m[:round_marker].size
|
|
671
|
+
round_level -= 1 if m[:round_marker].start_with?( '::' )
|
|
384
672
|
|
|
385
|
-
tokens << [:ROUND_OUTLINE, m[:round_outline]
|
|
673
|
+
tokens << [:ROUND_OUTLINE, [m[:round_outline],
|
|
674
|
+
{ outline: m[:round_outline] ,
|
|
675
|
+
level: round_level}]]
|
|
386
676
|
|
|
387
677
|
## note - eats-up line for now (change later to only eat-up marker e.g. »|>>)
|
|
388
678
|
offsets = [m.begin(0), m.end(0)]
|
|
389
679
|
pos = offsets[1] ## update pos
|
|
390
|
-
elsif (m =
|
|
391
|
-
## switch context to GOAL_RE (goalline(s)
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
680
|
+
elsif (m = START_GOAL_LINE_RE.match( line )) ## line starting with ( - assume
|
|
681
|
+
## switch context to GOAL_RE (goalline(s))
|
|
682
|
+
####
|
|
683
|
+
## note - check for alternate goal line styles / formats
|
|
684
|
+
if START_GOAL_LINE_COMPAT_RE.match(line )
|
|
685
|
+
## "legacy" style starting with minute e.g.
|
|
686
|
+
## (6 Puskás 0-1, 9 Czibor 0-2, 11 Morlock 1-2, 18 Rahn 2-2,
|
|
687
|
+
## 84 Rahn 3-2)
|
|
688
|
+
@re = GOAL_COMPAT_RE
|
|
689
|
+
puts " ENTER GOAL_COMPAT_RE MODE" if debug?
|
|
690
|
+
|
|
691
|
+
tokens << [:GOALS_COMPAT, "<|GOALS_COMPAT|>"]
|
|
692
|
+
elsif START_GOAL_LINE_ALT_RE.match( line )
|
|
693
|
+
## goals with scores e.g.
|
|
694
|
+
## (1-0 Franck Ribéry, 2-0 Ivica Olić, 2-1 Wayne Rooney)
|
|
695
|
+
## -or-
|
|
696
|
+
## (Dion Beljo 1-0
|
|
697
|
+
## 1-1 Andreas Gruber
|
|
698
|
+
## Matthias Seidl 2-1)
|
|
699
|
+
@re = GOAL_ALT_RE
|
|
700
|
+
puts " ENTER GOAL_ALT_RE MODE" if debug?
|
|
701
|
+
|
|
702
|
+
tokens << [:GOALS_ALT, "<|GOALS_ALT|>"]
|
|
703
|
+
else
|
|
704
|
+
## "standard" / default style
|
|
705
|
+
@re = GOAL_RE
|
|
706
|
+
puts " ENTER GOAL_RE MODE" if debug?
|
|
410
707
|
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
### FIX - improve / rework PLAYER_WITH_MINUTE_RE regex!!!!
|
|
414
|
-
elsif (_quick = QUICK_PLAYER_WITH_MINUTE_RE.match(line) &&
|
|
415
|
-
m = PLAYER_WITH_MINUTE_RE.match( line ))
|
|
416
|
-
## switch context to GOAL_RE (goalline(s)
|
|
417
|
-
## split token (automagically) into two!! - player AND minute!!!
|
|
418
|
-
@re = GOAL_RE
|
|
419
|
-
puts " ENTER GOAL_RE MODE" if debug?
|
|
420
|
-
|
|
421
|
-
## check for optional open_bracket
|
|
422
|
-
tokens << [:'['] if m[:open_bracket]
|
|
423
|
-
|
|
424
|
-
## check for -; (none with separator)
|
|
425
|
-
## todo - find a better way? how possible?
|
|
426
|
-
tokens << [:NONE, "<|NONE|>"] if m[:none]
|
|
427
|
-
|
|
428
|
-
## auto-add player token first
|
|
429
|
-
tokens << [:PLAYER, m[:name]]
|
|
430
|
-
## minute props
|
|
431
|
-
minute = {}
|
|
432
|
-
minute[:m] = m[:value].to_i(10)
|
|
433
|
-
minute[:offset] = m[:value2].to_i(10) if m[:value2]
|
|
434
|
-
## t is minute only
|
|
435
|
-
tokens << [:MINUTE, [m[:minute], minute]]
|
|
708
|
+
tokens << [:GOALS, "<|GOALS|>"]
|
|
709
|
+
end
|
|
436
710
|
|
|
711
|
+
## note - eat-up ( for now
|
|
712
|
+
## pass along "virtual" GOALS or GOALS_ALT token
|
|
713
|
+
## (see INLINE_GOALS for the starting goal line inline)
|
|
437
714
|
offsets = [m.begin(0), m.end(0)]
|
|
438
|
-
pos = offsets[1] ## update pos
|
|
715
|
+
pos = offsets[1] ## update pos
|
|
439
716
|
end
|
|
440
717
|
end
|
|
441
718
|
|
|
@@ -475,24 +752,105 @@ def _tokenize_line( line )
|
|
|
475
752
|
## note: racc requires pairs e.g. [:TOKEN, VAL]
|
|
476
753
|
## for VAL use "text" or ["text", { opts }] array
|
|
477
754
|
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
[:
|
|
484
|
-
|
|
485
|
-
[:
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
755
|
+
|
|
756
|
+
t = if @re == ROUND_DEF_RE
|
|
757
|
+
if m[:spaces] || m[:space]
|
|
758
|
+
nil ## skip spaces
|
|
759
|
+
elsif m[:date]
|
|
760
|
+
[:DATE, [m[:date], _build_date( m )]]
|
|
761
|
+
elsif m[:duration]
|
|
762
|
+
[:DURATION, [m[:duration], _build_duration( m )]]
|
|
763
|
+
elsif m[:sym]
|
|
764
|
+
sym = m[:sym]
|
|
765
|
+
case sym
|
|
766
|
+
when '|' then [:'|']
|
|
767
|
+
when ':' then [:':']
|
|
768
|
+
when ',' then [:',']
|
|
769
|
+
else
|
|
770
|
+
puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
|
|
771
|
+
nil ## ignore others (e.g. brackets [])
|
|
772
|
+
end
|
|
773
|
+
elsif m[:any]
|
|
774
|
+
## todo/check log error
|
|
775
|
+
msg = "parse error (tokenize round_def) - skipping any match>#{m[:any]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
|
|
776
|
+
puts "!! WARN - #{msg}"
|
|
777
|
+
|
|
778
|
+
errors << msg
|
|
779
|
+
log( "!! WARN - #{msg}" )
|
|
780
|
+
|
|
781
|
+
nil
|
|
782
|
+
else
|
|
783
|
+
## report error/raise exception
|
|
784
|
+
puts "!!! TOKENIZE ERROR - no match found"
|
|
785
|
+
nil
|
|
786
|
+
end
|
|
787
|
+
elsif @re == GROUP_DEF_RE
|
|
788
|
+
if m[:spaces] || m[:space]
|
|
789
|
+
nil ## skip spaces
|
|
790
|
+
elsif m[:text]
|
|
791
|
+
[:TEAM, m[:text]]
|
|
792
|
+
elsif m[:sym]
|
|
793
|
+
sym = m[:sym]
|
|
794
|
+
case sym
|
|
795
|
+
when '|' then [:'|']
|
|
796
|
+
when ':' then [:':']
|
|
797
|
+
when ',' then [:',']
|
|
798
|
+
else
|
|
799
|
+
puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
|
|
800
|
+
nil ## ignore others (e.g. brackets [])
|
|
801
|
+
end
|
|
802
|
+
elsif m[:any]
|
|
803
|
+
## todo/check log error
|
|
804
|
+
msg = "parse error (tokenize group_def) - skipping any match>#{m[:any]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
|
|
805
|
+
puts "!! WARN - #{msg}"
|
|
806
|
+
|
|
807
|
+
errors << msg
|
|
808
|
+
log( "!! WARN - #{msg}" )
|
|
809
|
+
|
|
810
|
+
nil
|
|
811
|
+
else
|
|
812
|
+
## report error/raise exception
|
|
813
|
+
puts "!!! TOKENIZE ERROR - no match found"
|
|
814
|
+
nil
|
|
815
|
+
end
|
|
816
|
+
elsif @re == GEO_RE
|
|
817
|
+
### note - possibly end inline geo on [ (and others?? in the future
|
|
818
|
+
## note: break on double spaces e.g.
|
|
819
|
+
## e.g. Jul/16 @ Arena Auf Schalke, Gelsenkirchen Serbia 0-1 England
|
|
820
|
+
if m[:spaces]
|
|
821
|
+
### note - do NOT break out
|
|
822
|
+
## if no text seen yet!!!
|
|
823
|
+
if geo_count > 0
|
|
824
|
+
## get out of geo mode and backtrack (w/ next)
|
|
825
|
+
puts " LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
|
|
826
|
+
@re = RE
|
|
827
|
+
pos = old_pos
|
|
828
|
+
next ## backtrack (resume new loop step)
|
|
829
|
+
else
|
|
830
|
+
nil ## skip spaces
|
|
831
|
+
end
|
|
832
|
+
elsif m[:space]
|
|
833
|
+
nil ## skip (single) space
|
|
834
|
+
elsif m[:text]
|
|
835
|
+
geo_count += 1
|
|
836
|
+
[:GEO, m[:text]] ## keep pos - why? why not?
|
|
837
|
+
elsif m[:geo_end] ## "hacky" special comma; always ends geo mode!!!
|
|
838
|
+
## get out of geo mode and backtrack (w/ next)
|
|
839
|
+
puts " LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
|
|
840
|
+
@re = RE
|
|
841
|
+
pos = old_pos
|
|
842
|
+
next ## backtrack (resume new loop step)
|
|
843
|
+
elsif m[:sym]
|
|
844
|
+
sym = m[:sym]
|
|
845
|
+
## return symbols "inline" as is - why? why not?
|
|
846
|
+
## (?<sym>[;,@|\[\]-])
|
|
847
|
+
case sym
|
|
848
|
+
## note - reset geo_count to 0 (avoids break on two spaces)
|
|
849
|
+
## if separator seen!!
|
|
850
|
+
when ',' then geo_count = 0; [:',']
|
|
851
|
+
when '›' then geo_count = 0; [:','] ## note - treat geo sep › (unicode) like comma for now!!!
|
|
852
|
+
when '>' then geo_count = 0; [:','] ## note - treat geo sep > (ascii) like comma for now!!!
|
|
853
|
+
when '[' then
|
|
496
854
|
## get out of geo mode and backtrack (w/ next)
|
|
497
855
|
puts " LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
|
|
498
856
|
@re = RE
|
|
@@ -554,19 +912,29 @@ def _tokenize_line( line )
|
|
|
554
912
|
## report error - for unknown (inline) prop key in lineup
|
|
555
913
|
nil
|
|
556
914
|
end
|
|
915
|
+
elsif m[:inline_captain]
|
|
916
|
+
[:INLINE_CAPTAIN, m[:inline_captain]]
|
|
917
|
+
elsif m[:inline_yellow]
|
|
918
|
+
card = {}
|
|
919
|
+
card[:m] = m[:minute].to_i(10) if m[:minute]
|
|
920
|
+
card[:offset] = m[:offset].to_i(10) if m[:offset]
|
|
921
|
+
[:INLINE_YELLOW, [m[:inline_yellow], card]]
|
|
922
|
+
elsif m[:inline_red]
|
|
923
|
+
card = {}
|
|
924
|
+
card[:m] = m[:minute].to_i(10) if m[:minute]
|
|
925
|
+
card[:offset] = m[:offset].to_i(10) if m[:offset]
|
|
926
|
+
[:INLINE_RED, [m[:inline_red], card]]
|
|
927
|
+
elsif m[:inline_yellow_red]
|
|
928
|
+
card = {}
|
|
929
|
+
card[:m] = m[:minute].to_i(10) if m[:minute]
|
|
930
|
+
card[:offset] = m[:offset].to_i(10) if m[:offset]
|
|
931
|
+
[:INLINE_YELLOW_RED, [m[:inline_yellow_red], card]]
|
|
557
932
|
elsif m[:prop_name]
|
|
558
|
-
|
|
559
|
-
[:YELLOW_CARD, m[:name]]
|
|
560
|
-
elsif m[:name] == 'R'
|
|
561
|
-
[:RED_CARD, m[:name]]
|
|
562
|
-
else
|
|
563
|
-
[:PROP_NAME, m[:name]]
|
|
564
|
-
end
|
|
933
|
+
[:PROP_NAME, m[:name]]
|
|
565
934
|
elsif m[:minute]
|
|
566
935
|
minute = {}
|
|
567
936
|
minute[:m] = m[:value].to_i(10)
|
|
568
937
|
minute[:offset] = m[:value2].to_i(10) if m[:value2]
|
|
569
|
-
## note - for debugging keep (pass along) "literal" minute
|
|
570
938
|
[:MINUTE, [m[:minute], minute]]
|
|
571
939
|
elsif m[:sym]
|
|
572
940
|
sym = m[:sym]
|
|
@@ -661,9 +1029,8 @@ def _tokenize_line( line )
|
|
|
661
1029
|
## must always have ft for now e.g. 1-1 or such
|
|
662
1030
|
### change to (generic) score from ft -
|
|
663
1031
|
## might be score a.e.t. or such - why? why not?
|
|
664
|
-
score[:
|
|
665
|
-
|
|
666
|
-
## note - for debugging keep (pass along) "literal" score
|
|
1032
|
+
score[:score] = [m[:score1].to_i(10),
|
|
1033
|
+
m[:score2].to_i(10)]
|
|
667
1034
|
[:SCORE, [m[:score], score]]
|
|
668
1035
|
elsif m[:sym]
|
|
669
1036
|
sym = m[:sym]
|
|
@@ -680,30 +1047,107 @@ def _tokenize_line( line )
|
|
|
680
1047
|
puts "!!! TOKENIZE ERROR (PROP_PENALTIES_RE) - no match found"
|
|
681
1048
|
nil
|
|
682
1049
|
end
|
|
683
|
-
elsif @re ==
|
|
1050
|
+
elsif @re == GOAL_COMPAT_RE
|
|
684
1051
|
if m[:space] || m[:spaces]
|
|
685
1052
|
nil ## skip space(s)
|
|
686
1053
|
elsif m[:prop_name] ## note - change prop_name to player
|
|
687
1054
|
[:PLAYER, m[:name]]
|
|
688
1055
|
elsif m[:minute]
|
|
689
|
-
minute =
|
|
690
|
-
minute[:m] = m[:value].to_i(10)
|
|
691
|
-
minute[:offset] = m[:value2].to_i(10) if m[:value2]
|
|
692
|
-
## note - for debugging keep (pass along) "literal" minute
|
|
1056
|
+
minute = _build_minute( m )
|
|
693
1057
|
[:MINUTE, [m[:minute], minute]]
|
|
1058
|
+
elsif m[:goal_type]
|
|
1059
|
+
goal_type = _build_goal_type( m )
|
|
1060
|
+
[:GOAL_TYPE, [m[:goal_type], goal_type]]
|
|
694
1061
|
elsif m[:score]
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
elsif m[:
|
|
706
|
-
|
|
1062
|
+
score = {}
|
|
1063
|
+
## note - score is "generic"
|
|
1064
|
+
## might be full-time (ft) or
|
|
1065
|
+
## after extra-time (aet) or such
|
|
1066
|
+
## or even undecided/unknown
|
|
1067
|
+
## thus, use score1/score2 and NOT ft1/ft2
|
|
1068
|
+
score[:score] = [m[:score1].to_i(10),
|
|
1069
|
+
m[:score2].to_i(10)]
|
|
1070
|
+
## note - for debugging keep (pass along) "literal" score
|
|
1071
|
+
[:SCORE, [m[:score], score]]
|
|
1072
|
+
elsif m[:sym]
|
|
1073
|
+
sym = m[:sym]
|
|
1074
|
+
## return symbols "inline" as is - why? why not?
|
|
1075
|
+
## (?<sym>[;,@|\[\]-])
|
|
1076
|
+
|
|
1077
|
+
case sym
|
|
1078
|
+
when ',' then [:',']
|
|
1079
|
+
when ')' ## leave goal mode!!
|
|
1080
|
+
puts " LEAVE GOAL_COMPAT_RE MODE" if debug?
|
|
1081
|
+
@re = RE
|
|
1082
|
+
## note - use/return GOAL_END token - change to GOAL_END_PAREN(THESIS)
|
|
1083
|
+
## or GOAL_PAREN_CLOSE/END ???
|
|
1084
|
+
[:GOALS_END, '<|GOALS_END|>']
|
|
1085
|
+
else
|
|
1086
|
+
nil ## ignore others (e.g. brackets [])
|
|
1087
|
+
end
|
|
1088
|
+
else
|
|
1089
|
+
## report error
|
|
1090
|
+
puts "!!! TOKENIZE ERROR (GOAL_COMPAT_RE) - no match found"
|
|
1091
|
+
nil
|
|
1092
|
+
end
|
|
1093
|
+
elsif @re == GOAL_ALT_RE
|
|
1094
|
+
if m[:space] || m[:spaces]
|
|
1095
|
+
nil ## skip space(s)
|
|
1096
|
+
elsif m[:prop_name] ## note - change prop_name to player
|
|
1097
|
+
[:PLAYER, m[:name]]
|
|
1098
|
+
elsif m[:goal_minute]
|
|
1099
|
+
minute = _build_goal_minute( m )
|
|
1100
|
+
[:GOAL_MINUTE, [m[:goal_minute], minute]]
|
|
1101
|
+
elsif m[:goal_type]
|
|
1102
|
+
goal_type = _build_goal_type( m )
|
|
1103
|
+
[:GOAL_TYPE, [m[:goal_type], goal_type]]
|
|
1104
|
+
elsif m[:score]
|
|
1105
|
+
score = {}
|
|
1106
|
+
## note - score is "generic"
|
|
1107
|
+
## might be full-time (ft) or
|
|
1108
|
+
## after extra-time (aet) or such
|
|
1109
|
+
## or even undecided/unknown
|
|
1110
|
+
## thus, use score1/score2 and NOT ft1/ft2
|
|
1111
|
+
score[:score] = [m[:score1].to_i(10),
|
|
1112
|
+
m[:score2].to_i(10)]
|
|
1113
|
+
## note - for debugging keep (pass along) "literal" score
|
|
1114
|
+
[:SCORE, [m[:score], score]]
|
|
1115
|
+
elsif m[:sym]
|
|
1116
|
+
sym = m[:sym]
|
|
1117
|
+
## return symbols "inline" as is - why? why not?
|
|
1118
|
+
## (?<sym>[;,@|\[\]-])
|
|
1119
|
+
|
|
1120
|
+
case sym
|
|
1121
|
+
when ',' then [:',']
|
|
1122
|
+
when ')' ## leave goal mode!!
|
|
1123
|
+
puts " LEAVE GOAL_ALT_RE MODE" if debug?
|
|
1124
|
+
@re = RE
|
|
1125
|
+
## note - use/return GOAL_END token - change to GOAL_END_PAREN(THESIS)
|
|
1126
|
+
## or GOAL_PAREN_CLOSE/END ???
|
|
1127
|
+
[:GOALS_END, '<|GOALS_END|>']
|
|
1128
|
+
else
|
|
1129
|
+
nil ## ignore others (e.g. brackets [])
|
|
1130
|
+
end
|
|
1131
|
+
else
|
|
1132
|
+
## report error
|
|
1133
|
+
puts "!!! TOKENIZE ERROR (GOAL_ALT_RE) - no match found"
|
|
1134
|
+
nil
|
|
1135
|
+
end
|
|
1136
|
+
elsif @re == GOAL_RE
|
|
1137
|
+
if m[:space] || m[:spaces]
|
|
1138
|
+
nil ## skip space(s)
|
|
1139
|
+
elsif m[:goals_none] ## note - eats-up semicolon!! e.g. -; or - ;
|
|
1140
|
+
[:GOALS_NONE, "<|GOALS_NONE|>"]
|
|
1141
|
+
elsif m[:goal_sep_alt]
|
|
1142
|
+
[:GOAL_SEP_ALT, "<|GOAL_SEP_ALT|>" ] ## e.g. dash (-) WITH leading & trailing space required
|
|
1143
|
+
elsif m[:prop_name] ## note - change prop_name to player
|
|
1144
|
+
[:PLAYER, m[:name]]
|
|
1145
|
+
elsif m[:goal_minute]
|
|
1146
|
+
minute = _build_goal_minute( m )
|
|
1147
|
+
[:GOAL_MINUTE, [m[:goal_minute], minute]]
|
|
1148
|
+
elsif m[:goal_count]
|
|
1149
|
+
count = _build_goal_count( m )
|
|
1150
|
+
[:GOAL_COUNT, [m[:goal_count], count]]
|
|
707
1151
|
elsif m[:sym]
|
|
708
1152
|
sym = m[:sym]
|
|
709
1153
|
## return symbols "inline" as is - why? why not?
|
|
@@ -712,8 +1156,14 @@ def _tokenize_line( line )
|
|
|
712
1156
|
case sym
|
|
713
1157
|
when ',' then [:',']
|
|
714
1158
|
when ';' then [:';']
|
|
715
|
-
when '[' then [:'[']
|
|
716
|
-
when ']' then [:']']
|
|
1159
|
+
# when '[' then [:'[']
|
|
1160
|
+
# when ']' then [:']']
|
|
1161
|
+
when ')' ## leave goal mode!!
|
|
1162
|
+
puts " LEAVE GOAL_RE MODE" if debug?
|
|
1163
|
+
@re = RE
|
|
1164
|
+
## note - use/return GOAL_END token - change to GOAL_END_PAREN(THESIS)
|
|
1165
|
+
## or GOAL_PAREN_CLOSE/END ???
|
|
1166
|
+
[:GOALS_END, '<|GOALS_END|>']
|
|
717
1167
|
else
|
|
718
1168
|
nil ## ignore others (e.g. brackets [])
|
|
719
1169
|
end
|
|
@@ -728,74 +1178,112 @@ def _tokenize_line( line )
|
|
|
728
1178
|
if m[:space] || m[:spaces]
|
|
729
1179
|
nil ## skip space(s)
|
|
730
1180
|
elsif m[:text]
|
|
731
|
-
|
|
1181
|
+
## note - top-level (for now always) assumes TEAM for TEXT match!!
|
|
1182
|
+
[:TEAM, m[:text]] ## keep pos - why? why not?
|
|
732
1183
|
elsif m[:status] ## (match) status e.g. cancelled, awarded, etc.
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
1184
|
+
[:STATUS, [m[:status], _build_status( m ) ]]
|
|
1185
|
+
elsif m[:inline_wo] ## w/o - walkover (match status)
|
|
1186
|
+
[:INLINE_WO, m[:inline_wo]]
|
|
1187
|
+
elsif m[:inline_np] ## n/p - not played (match status)
|
|
1188
|
+
[:INLINE_NP, m[:inline_np]]
|
|
1189
|
+
elsif m[:inline_bye] ## bye (match status)
|
|
1190
|
+
[:INLINE_BYE, m[:inline_bye]]
|
|
1191
|
+
elsif m[:inline_abd] ## abd/abd. - abandoned (match status)
|
|
1192
|
+
[:INLINE_ABD, m[:inline_abd]]
|
|
1193
|
+
elsif m[:inline_void] ## void (match status)
|
|
1194
|
+
[:INLINE_VOID, m[:inline_void]]
|
|
1195
|
+
elsif m[:inline_susp] ## susp/susp. - suspended (match status)
|
|
1196
|
+
[:INLINE_SUSP, m[:inline_susp]]
|
|
1197
|
+
elsif m[:inline_ppd] ## ppd/ppd. or postp/postp. - postponed (match status)
|
|
1198
|
+
[:INLINE_PPD, m[:inline_ppd]]
|
|
1199
|
+
elsif m[:inline_awd] ## awd/awd. - awarded (match status)
|
|
1200
|
+
[:INLINE_AWD, m[:inline_awd]]
|
|
1201
|
+
elsif m[:inline_canc] ## canc/canc. - cancelled/canceled (match status)
|
|
1202
|
+
[:INLINE_CANC, m[:inline_canc]]
|
|
1203
|
+
|
|
1204
|
+
elsif m[:team_home]
|
|
1205
|
+
[:TEAM_HOME, m[:team_home]]
|
|
1206
|
+
elsif m[:team_away]
|
|
1207
|
+
[:TEAM_AWAY, m[:team_away]]
|
|
1208
|
+
elsif m[:team_neutral]
|
|
1209
|
+
[:TEAM_NEUTRAL, m[:team_neutral]]
|
|
1210
|
+
|
|
1211
|
+
elsif m[:attendance]
|
|
1212
|
+
att = {}
|
|
1213
|
+
att[:value] = m[:value].gsub( '_', '' ).to_i(10)
|
|
1214
|
+
## note - for token id use INLINE_ATTENDANCE (ATTENDANCE in use for prop!!!)
|
|
1215
|
+
[:INLINE_ATTENDANCE, [m[:attendance], att ]]
|
|
741
1216
|
elsif m[:note]
|
|
742
1217
|
### todo/check:
|
|
743
1218
|
## use value hash - why? why not? or simplify to:
|
|
744
1219
|
## [:NOTE, [m[:note], {note: m[:note] } ]]
|
|
745
1220
|
[:NOTE, m[:note]]
|
|
746
|
-
elsif m[:score_note]
|
|
747
|
-
[:SCORE_NOTE, m[:score_note]]
|
|
748
1221
|
elsif m[:time]
|
|
749
|
-
|
|
750
|
-
### 12.40 => 12:40
|
|
751
|
-
## 12h40 => 12:40 etc.
|
|
752
|
-
## keep string (no time-only type in ruby)
|
|
753
|
-
hour = m[:hour].to_i(10) ## allow 08/07/etc.
|
|
754
|
-
minute = m[:minute].to_i(10)
|
|
755
|
-
## check if valid - 0:00 - 24:00
|
|
756
|
-
## check if 24:00 possible? or only 0:00 (23:59)
|
|
757
|
-
if (hour >= 0 && hour <= 24) &&
|
|
758
|
-
(minute >=0 && minute <= 59)
|
|
759
|
-
## note - for debugging keep (pass along) "literal" time
|
|
760
|
-
## might use/add support for am/pm later
|
|
761
|
-
[:TIME, [m[:time], {h:hour,m:minute}]]
|
|
762
|
-
else
|
|
763
|
-
raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
|
|
764
|
-
end
|
|
1222
|
+
[:TIME, [m[:time], _build_time(m)]]
|
|
765
1223
|
elsif m[:date]
|
|
766
|
-
date
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
##
|
|
795
|
-
|
|
796
|
-
|
|
1224
|
+
[:DATE, [m[:date], _build_date(m)]]
|
|
1225
|
+
elsif m[:date_legs]
|
|
1226
|
+
[:DATE_LEGS, [m[:date_legs], _build_date_legs(m)]]
|
|
1227
|
+
elsif m[:score_team]
|
|
1228
|
+
[:SCORE_TEAM, [m[:score_team], _build_score_team(m)]]
|
|
1229
|
+
elsif m[:score_team_pen]
|
|
1230
|
+
[:SCORE_TEAM_PEN, [m[:score_team_pen], _build_score_team_pen(m)]]
|
|
1231
|
+
elsif m[:score_team_num]
|
|
1232
|
+
[:SCORE_TEAM_NUM, [m[:score_team_num], _build_score_team_num(m)]]
|
|
1233
|
+
elsif m[:score_legs]
|
|
1234
|
+
legs = {}
|
|
1235
|
+
|
|
1236
|
+
### leg1
|
|
1237
|
+
score = {}
|
|
1238
|
+
score[:ft] = [m[:leg1_ft1].to_i(10),
|
|
1239
|
+
m[:leg1_ft2].to_i(10)]
|
|
1240
|
+
legs['leg1'] = score
|
|
1241
|
+
|
|
1242
|
+
### leg2
|
|
1243
|
+
score = {}
|
|
1244
|
+
score[:ft] = [m[:leg2_ft1].to_i(10),
|
|
1245
|
+
m[:leg2_ft2].to_i(10)] if m[:leg2_ft1] && m[:leg2_ft2]
|
|
1246
|
+
score[:et] = [m[:leg2_et1].to_i(10),
|
|
1247
|
+
m[:leg2_et2].to_i(10)] if m[:leg2_et1] && m[:leg2_et2]
|
|
1248
|
+
score[:p] = [m[:leg2_p1].to_i(10),
|
|
1249
|
+
m[:leg2_p2].to_i(10)] if m[:leg2_p1] && m[:leg2_p2]
|
|
1250
|
+
legs['leg2'] = score
|
|
1251
|
+
|
|
1252
|
+
## check for (opt) aggregate - keep on "top-level"
|
|
1253
|
+
legs[:agg] = [m[:agg1].to_i(10),
|
|
1254
|
+
m[:agg2].to_i(10)] if m[:agg1] && m[:agg2]
|
|
1255
|
+
legs[:away] = true if m[:away]
|
|
1256
|
+
|
|
1257
|
+
## note - for debugging keep (pass along) "literal" score
|
|
1258
|
+
[:SCORE_LEGS, [m[:score_legs], legs]]
|
|
1259
|
+
elsif m[:score_full]
|
|
1260
|
+
score = {}
|
|
1261
|
+
score[:p] = [m[:p1].to_i(10),
|
|
1262
|
+
m[:p2].to_i(10)] if m[:p1] && m[:p2]
|
|
1263
|
+
score[:et] = [m[:et1].to_i(10),
|
|
1264
|
+
m[:et2].to_i(10)] if m[:et1] && m[:et2]
|
|
1265
|
+
score[:ft] = [m[:ft1].to_i(10),
|
|
1266
|
+
m[:ft2].to_i(10)] if m[:ft1] && m[:ft2]
|
|
1267
|
+
score[:ht] = [m[:ht1].to_i(10),
|
|
1268
|
+
m[:ht2].to_i(10)] if m[:ht1] && m[:ht2]
|
|
1269
|
+
score[:agg] = [m[:agg1].to_i(10),
|
|
1270
|
+
m[:agg2].to_i(10)] if m[:agg1] && m[:agg2]
|
|
1271
|
+
|
|
1272
|
+
if m[:away1] && m[:away2]
|
|
1273
|
+
score[:away] = [m[:away1].to_i(10),
|
|
1274
|
+
m[:away2].to_i(10)]
|
|
1275
|
+
elsif m[:away] ## fallback if no away score; check away flag
|
|
1276
|
+
score[:away] = true
|
|
1277
|
+
end
|
|
1278
|
+
|
|
1279
|
+
## add golden/silver flags
|
|
1280
|
+
score[:golden] = true if m[:aetgg] ## golden goal (gg)/sudden death (sd)
|
|
1281
|
+
score[:silver] = true if m[:aetsg] ## silver goal (sg)
|
|
1282
|
+
|
|
1283
|
+
## note - for debugging keep (pass along) "literal" score
|
|
1284
|
+
[:SCORE_FULL, [m[:score_full], score]]
|
|
1285
|
+
elsif m[:score_fuller]
|
|
797
1286
|
score = {}
|
|
798
|
-
## check for pen
|
|
799
1287
|
score[:p] = [m[:p1].to_i(10),
|
|
800
1288
|
m[:p2].to_i(10)] if m[:p1] && m[:p2]
|
|
801
1289
|
score[:et] = [m[:et1].to_i(10),
|
|
@@ -804,18 +1292,85 @@ def _tokenize_line( line )
|
|
|
804
1292
|
m[:ft2].to_i(10)] if m[:ft1] && m[:ft2]
|
|
805
1293
|
score[:ht] = [m[:ht1].to_i(10),
|
|
806
1294
|
m[:ht2].to_i(10)] if m[:ht1] && m[:ht2]
|
|
1295
|
+
score[:agg] = [m[:agg1].to_i(10),
|
|
1296
|
+
m[:agg2].to_i(10)] if m[:agg1] && m[:agg2]
|
|
1297
|
+
if m[:away1] && m[:away2]
|
|
1298
|
+
score[:away] = [m[:away1].to_i(10),
|
|
1299
|
+
m[:away2].to_i(10)]
|
|
1300
|
+
elsif m[:away] ## fallback if no away score; check away flag
|
|
1301
|
+
score[:away] = true
|
|
1302
|
+
end
|
|
1303
|
+
|
|
1304
|
+
## add aet flag true/false
|
|
1305
|
+
# score[:aet] = true if m[:aet] || m[:aetgg] || m[:aetsg]
|
|
1306
|
+
|
|
1307
|
+
## add golden/silver flags
|
|
1308
|
+
score[:golden] = true if m[:aetgg] ## golden goal (gg)/sudden death (sd)
|
|
1309
|
+
score[:silver] = true if m[:aetsg] ## silver goal (sg)
|
|
807
1310
|
|
|
808
1311
|
## note - for debugging keep (pass along) "literal" score
|
|
809
|
-
[:
|
|
1312
|
+
[:SCORE_FULLER, [m[:score_fuller], score]]
|
|
1313
|
+
elsif m[:score_fuller_more]
|
|
1314
|
+
## SCORE + SCORE_FULLER_MORE
|
|
1315
|
+
## note - after extra-time (aet) or full-time (ft)
|
|
1316
|
+
## score may be present in SCORE!!!
|
|
1317
|
+
score = {}
|
|
1318
|
+
score[:p] = [m[:p1].to_i(10),
|
|
1319
|
+
m[:p2].to_i(10)] if m[:p1] && m[:p2]
|
|
1320
|
+
score[:et] = [m[:et1].to_i(10),
|
|
1321
|
+
m[:et2].to_i(10)] if m[:et1] && m[:et2]
|
|
1322
|
+
score[:ft] = [m[:ft1].to_i(10),
|
|
1323
|
+
m[:ft2].to_i(10)] if m[:ft1] && m[:ft2]
|
|
1324
|
+
score[:ht] = [m[:ht1].to_i(10),
|
|
1325
|
+
m[:ht2].to_i(10)] if m[:ht1] && m[:ht2]
|
|
1326
|
+
score[:agg] = [m[:agg1].to_i(10),
|
|
1327
|
+
m[:agg2].to_i(10)] if m[:agg1] && m[:agg2]
|
|
1328
|
+
if m[:away1] && m[:away2]
|
|
1329
|
+
score[:away] = [m[:away1].to_i(10),
|
|
1330
|
+
m[:away2].to_i(10)]
|
|
1331
|
+
elsif m[:away] ## fallback if no away score; check away flag
|
|
1332
|
+
score[:away] = true
|
|
1333
|
+
end
|
|
1334
|
+
|
|
1335
|
+
## add flag in score for et/ft/ht
|
|
1336
|
+
score[:score] = 'et' if m[:aet] || m[:aetgg] || m[:aetsg]
|
|
1337
|
+
score[:score] = 'ft' if m[:ft]
|
|
1338
|
+
score[:score] = 'ht' if m[:ht]
|
|
1339
|
+
|
|
1340
|
+
## add golden/silver flags
|
|
1341
|
+
score[:golden] = true if m[:aetgg] ## golden goal (gg)/sudden death (sd)
|
|
1342
|
+
score[:silver] = true if m[:aetsg] ## silver goal (sg)
|
|
1343
|
+
|
|
1344
|
+
## note - for debugging keep (pass along) "literal" score
|
|
1345
|
+
[:SCORE_FULLER_MORE, [m[:score_fuller_more], score]]
|
|
810
1346
|
elsif m[:score]
|
|
811
1347
|
score = {}
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
##
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
1348
|
+
## note - score is "generic"
|
|
1349
|
+
## might be full-time (ft) or
|
|
1350
|
+
## after extra-time (aet) or such
|
|
1351
|
+
## or even undecided/unknown
|
|
1352
|
+
## thus, use score1/score2 and NOT ft1/ft2
|
|
1353
|
+
score[:score] = [m[:score1].to_i(10),
|
|
1354
|
+
m[:score2].to_i(10)]
|
|
1355
|
+
## note - for debugging keep (pass along) "literal" score
|
|
818
1356
|
[:SCORE, [m[:score], score]]
|
|
1357
|
+
elsif m[:score_awd] ## score awarded (awd/awd.)
|
|
1358
|
+
score = {}
|
|
1359
|
+
### note - use "generic" score for now
|
|
1360
|
+
## to match A 3-0 B [awarded] etc.
|
|
1361
|
+
score[:score] = [m[:score1].to_i(10),
|
|
1362
|
+
m[:score2].to_i(10)]
|
|
1363
|
+
## add score[:awarded] = true ???
|
|
1364
|
+
## or only use match status to avoid duplicate?
|
|
1365
|
+
[:SCORE_AWD, [m[:score_awd], score]]
|
|
1366
|
+
elsif m[:score_abd] ## score abandoned (abd/abd.)
|
|
1367
|
+
score = {}
|
|
1368
|
+
### note - use "generic" score for now
|
|
1369
|
+
score[:score] = [m[:score1].to_i(10),
|
|
1370
|
+
m[:score2].to_i(10)]
|
|
1371
|
+
## add score[:awarded] = true ???
|
|
1372
|
+
## or only use match status to avoid duplicate?
|
|
1373
|
+
[:SCORE_ABD, [m[:score_abd], score]]
|
|
819
1374
|
elsif m[:minute]
|
|
820
1375
|
minute = {}
|
|
821
1376
|
minute[:m] = m[:value].to_i(10)
|
|
@@ -833,6 +1388,7 @@ def _tokenize_line( line )
|
|
|
833
1388
|
when '@' ## enter geo mode
|
|
834
1389
|
puts " ENTER GEO_RE MODE" if debug?
|
|
835
1390
|
@re = GEO_RE
|
|
1391
|
+
geo_count = 0
|
|
836
1392
|
[:'@']
|
|
837
1393
|
when ',' then [:',']
|
|
838
1394
|
when ';' then [:';']
|
|
@@ -840,10 +1396,14 @@ def _tokenize_line( line )
|
|
|
840
1396
|
when '|' then [:'|']
|
|
841
1397
|
when '[' then [:'[']
|
|
842
1398
|
when ']' then [:']']
|
|
843
|
-
when '-' then [:'-']
|
|
844
|
-
when '
|
|
845
|
-
|
|
846
|
-
|
|
1399
|
+
when '-' then [:'-']
|
|
1400
|
+
when '(' ## enter goal scorer mode on "free-floating" open parenthesis!!!
|
|
1401
|
+
puts " ENTER GOAL_RE MODE" if debug?
|
|
1402
|
+
@re = GOAL_RE
|
|
1403
|
+
## note - eat-up ( for now; do NOT pass along as token
|
|
1404
|
+
## pass along "virtual" INLINE GOALS - why? why not?
|
|
1405
|
+
[:INLINE_GOALS, "<|INLINE_GOALS|>"]
|
|
1406
|
+
when ')' then [:')']
|
|
847
1407
|
else
|
|
848
1408
|
puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
|
|
849
1409
|
nil ## ignore others (e.g. brackets [])
|
|
@@ -884,21 +1444,24 @@ def _tokenize_line( line )
|
|
|
884
1444
|
end
|
|
885
1445
|
|
|
886
1446
|
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
1447
|
+
# if @re == GOAL_RE ### ALWAYS switch back to top level mode
|
|
1448
|
+
# puts " LEAVE GOAL_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
|
|
1449
|
+
# @re = RE
|
|
1450
|
+
# end
|
|
891
1451
|
|
|
892
1452
|
if @re == GEO_RE ### ALWAYS switch back to top level mode
|
|
893
1453
|
puts " LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
|
|
894
1454
|
@re = RE
|
|
895
1455
|
end
|
|
1456
|
+
|
|
1457
|
+
@re = RE if @re == GROUP_DEF_RE ### ALWAYS switch back to top level mode
|
|
1458
|
+
@re = RE if @re == ROUND_DEF_RE
|
|
896
1459
|
|
|
897
1460
|
##
|
|
898
1461
|
## if in prop mode continue if last token is [,-]
|
|
899
1462
|
## otherwise change back to "standard" mode
|
|
900
1463
|
if @re == PROP_RE || @re == PROP_CARDS_RE ||
|
|
901
|
-
@re ==
|
|
1464
|
+
@re == PROP_PENALTIES_RE ||
|
|
902
1465
|
@re == PROP_ATTENDANCE_RE || @re == PROP_REFEREE_RE
|
|
903
1466
|
if [:',', :'-', :';'].include?( tokens[-1][0] )
|
|
904
1467
|
## continue/stay in PROP_RE mode
|