sportdb-parser 0.7.1 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Manifest.txt +17 -4
- data/lib/sportdb/parser/lexer-on_goal.rb +172 -0
- data/lib/sportdb/parser/lexer-on_group_def.rb +31 -0
- data/lib/sportdb/parser/lexer-on_prop_lineup.rb +79 -0
- data/lib/sportdb/parser/lexer-on_prop_misc.rb +110 -0
- data/lib/sportdb/parser/lexer-on_prop_penalties.rb +40 -0
- data/lib/sportdb/parser/lexer-on_round_def.rb +37 -0
- data/lib/sportdb/parser/lexer-on_top.rb +125 -0
- data/lib/sportdb/parser/lexer-prep_doc.rb +131 -0
- data/lib/sportdb/parser/lexer-prep_line.rb +63 -0
- data/lib/sportdb/parser/lexer-tokenize.rb +449 -0
- data/lib/sportdb/parser/lexer.rb +133 -1363
- data/lib/sportdb/parser/lexer_buffer.rb +8 -37
- data/lib/sportdb/parser/lexer_token.rb +126 -0
- data/lib/sportdb/parser/parser.rb +1104 -1403
- data/lib/sportdb/parser/racc_parser.rb +36 -32
- data/lib/sportdb/parser/racc_tree.rb +65 -98
- data/lib/sportdb/parser/token-date--helpers.rb +130 -0
- data/lib/sportdb/parser/token-date--names.rb +108 -0
- data/lib/sportdb/parser/token-date.rb +20 -192
- data/lib/sportdb/parser/token-date_duration.rb +8 -27
- data/lib/sportdb/parser/token-geo.rb +16 -16
- data/lib/sportdb/parser/token-goals--helpers.rb +114 -0
- data/lib/sportdb/parser/token-goals.rb +103 -249
- data/lib/sportdb/parser/token-group.rb +8 -22
- data/lib/sportdb/parser/token-prop.rb +138 -124
- data/lib/sportdb/parser/token-prop_name.rb +48 -39
- data/lib/sportdb/parser/token-round.rb +21 -35
- data/lib/sportdb/parser/token-score--helpers.rb +189 -0
- data/lib/sportdb/parser/token-score.rb +9 -393
- data/lib/sportdb/parser/token-score_full.rb +331 -0
- data/lib/sportdb/parser/token-status.rb +44 -46
- data/lib/sportdb/parser/token-status_inline.rb +112 -0
- data/lib/sportdb/parser/token-text.rb +41 -31
- data/lib/sportdb/parser/token-time.rb +29 -26
- data/lib/sportdb/parser/token.rb +58 -159
- data/lib/sportdb/parser/version.rb +1 -1
- data/lib/sportdb/parser.rb +45 -17
- metadata +19 -6
- data/lib/sportdb/parser/blocktxt.rb +0 -99
- data/lib/sportdb/parser/lexer_tty.rb +0 -111
- data/lib/sportdb/parser/token-table.rb +0 -149
- data/lib/sportdb/parser/token_helpers.rb +0 -92
data/lib/sportdb/parser/lexer.rb
CHANGED
|
@@ -7,6 +7,9 @@ class Lexer
|
|
|
7
7
|
def log( msg )
|
|
8
8
|
## append msg to ./logs.txt
|
|
9
9
|
## use ./errors.txt - why? why not?
|
|
10
|
+
##
|
|
11
|
+
## change to ./logs_lexer.txt or such - why? why not?
|
|
12
|
+
## auto-add/prepend [Lexer] and timestamp!!! to msg - why? why not?
|
|
10
13
|
File.open( './logs.txt', 'a:utf-8' ) do |f|
|
|
11
14
|
f.write( msg )
|
|
12
15
|
f.write( "\n" )
|
|
@@ -14,387 +17,125 @@ def log( msg )
|
|
|
14
17
|
end
|
|
15
18
|
|
|
16
19
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
20
|
+
##
## print debug trace message(s) to stdout - only if debug mode is on
##   (see debug?); the first arg follows the "[DEBUG] Lexer -- " prefix
##   on the same line; every arg gets printed once (one per line)
##
##  note - fix: was `puts args` (printed ALL args again for every arg,
##               that is, duplicated output for calls with 2+ args)
def _trace( *args )
  if debug?
    print "[DEBUG] Lexer -- "
    args.each { |arg| puts arg }
  end
end
|
|
26
|
+
|
|
27
|
+
##
## print warning message(s) to stdout (always - independent of debug mode);
##   the first arg follows the "!! [WARN] Lexer -- " prefix
##   on the same line; every arg gets printed once (one per line)
##
##  note - fix: was `puts args` (printed ALL args again for every arg,
##               that is, duplicated output for calls with 2+ args)
def _warn( *args )
  print "!! [WARN] Lexer -- "
  args.each { |arg| puts arg }
end
|
|
29
31
|
|
|
32
|
+
##
## print info message(s) to stdout (always - independent of debug mode);
##   the first arg follows the "[INFO] Lexer -- " prefix
##   on the same line; every arg gets printed once (one per line)
##
##  note - fix: was `puts args` (printed ALL args again for every arg,
##               that is, duplicated output for calls with 2+ args)
def _info( *args )
  print "[INFO] Lexer -- "
  args.each { |arg| puts arg }
end
|
|
30
36
|
|
|
31
37
|
|
|
32
38
|
## debug mode on? note - strict check, that is,
##   only true if @debug is exactly true (and NOT just "truthy")
def debug?
  @debug == true
end
|
|
33
39
|
|
|
34
|
-
def initialize( lines, debug: false )
|
|
35
|
-
raise ArgumentError, "(string) text expected for lexer; got #{lines.class.name}" unless lines.is_a?(String)
|
|
36
|
-
|
|
37
|
-
@debug = debug
|
|
38
|
-
@txt = lines
|
|
39
|
-
end
|
|
40
40
|
|
|
41
41
|
|
|
42
|
-
HTML_COMMENT_RE = %r{ <!--
|
|
43
|
-
.*? ## note - use non-greedy/lazy *? match
|
|
44
|
-
-->
|
|
45
|
-
}xm ## note - turn on multi-line match (for dot (.))
|
|
46
42
|
|
|
47
43
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
## what about comments (e.g. #)?
|
|
51
|
-
## todo/check - rename to NOTE_BLOCK or TEXT_BLOCK or ???
|
|
52
|
-
PREPROC_BLOCK_RE = %r{ \[
|
|
53
|
-
[^\[\]\#]*? ## note - use non-greedy/lazy *? match
|
|
54
|
-
\]
|
|
55
|
-
}xm ## note - turn on multi-line match (for dot(.))
|
|
44
|
+
##
## setup lexer for the passed-in text (document)
##   txt   - text to tokenize; MUST be a string
##             (raises ArgumentError otherwise)
##   debug - turn on debug mode (see debug?); default is off (false)
def initialize( txt, debug: false )
  unless txt.is_a?(String)
    raise ArgumentError, "text as string expected for lexer; got #{txt.class.name}"
  end

  @txt   = txt
  @debug = debug
end
|
|
57
50
|
|
|
58
|
-
##
|
|
59
|
-
## check for "literal" (multi-line) note blocks
|
|
60
|
-
## eg. nb: or note:
|
|
61
|
-
## space required after double colon - why? why not?
|
|
62
|
-
PREPROC_NOTA_BENE_RE = %r{
|
|
63
|
-
^
|
|
64
|
-
[ ]* (?: nb | note) [ ]* : [ ]+
|
|
65
|
-
.+? ## non-greedy
|
|
66
|
-
|
|
67
|
-
## positive lookahead
|
|
68
|
-
## note - must end with blank line or end-of-file/document
|
|
69
|
-
## note - do NOT eat-up trailing hrule (---)
|
|
70
|
-
(?= (?: \n [ ]* -{3,} [ ]*)?
|
|
71
|
-
\n[ ]*\n
|
|
72
|
-
| \z
|
|
73
|
-
)
|
|
74
|
-
}xim
|
|
75
51
|
|
|
76
|
-
##
|
|
77
|
-
## replace "escaped" newline with non-newline char e.g. '↵'
|
|
78
|
-
LINE_CONTINUATION_RE = %r{
|
|
79
|
-
\\[ ]* \n
|
|
80
|
-
}x
|
|
81
52
|
|
|
82
53
|
|
|
54
|
+
def tokenize_with_errors
|
|
83
55
|
|
|
84
|
-
|
|
85
|
-
##
|
|
86
|
-
## e.g # teletype: true or TELETYPE: TRUE
|
|
87
|
-
## tty/teletype
|
|
56
|
+
tokens_by_line = [] ## note: add tokens line-by-line (flatten later)
|
|
57
|
+
errors = [] ## keep a list of errors - why? why not?
|
|
88
58
|
|
|
89
|
-
MAGIC_COMMENT_RE = %r{ \A
|
|
90
|
-
[ ]* ## optional leading spaces
|
|
91
|
-
\#+ ## note - allow ##,###, etc. too
|
|
92
|
-
[ ]* ## optional spaces
|
|
93
|
-
(?<magic_comment_key> tty | teletype )
|
|
94
|
-
[ ]* ## optional spaces
|
|
95
|
-
:
|
|
96
|
-
[ ]* ## optional spaces
|
|
97
|
-
(?<magic_comment_value> true | false )
|
|
98
|
-
[ ]* ## optional trailing spaces
|
|
99
|
-
\z
|
|
100
|
-
}ix
|
|
101
59
|
|
|
60
|
+
txt = _prep_doc( @txt )
|
|
102
61
|
|
|
103
62
|
|
|
104
63
|
|
|
64
|
+
####
|
|
65
|
+
## quick hack - keep re state/mode between tokenize calls!!!
|
|
66
|
+
@re ||= RE ## note - switch between RE & INSIDE_RE
|
|
105
67
|
|
|
68
|
+
lineno = 0
|
|
69
|
+
txt.each_line do |line|
|
|
70
|
+
lineno += 1
|
|
106
71
|
|
|
107
|
-
|
|
72
|
+
## todo - "inlined virtual/collapsed/folded newlines"
|
|
73
|
+
## check for "↵" !!!
|
|
74
|
+
## and add to lineno
|
|
108
75
|
|
|
109
|
-
####
|
|
110
|
-
## flags / modes
|
|
111
|
-
@teletype = false # use magic comment - tty/teletype: true
|
|
112
76
|
|
|
77
|
+
## note - KEEP leading spaces for indent
|
|
78
|
+
## use rstrip (NOT left/leading & right/trailing strip) only!!
|
|
79
|
+
## note - remove/strip trailing newline (and optional spaces)!!!
|
|
80
|
+
## trailing whitespace may incl. \n or \r\n!!!
|
|
81
|
+
line = line.rstrip
|
|
113
82
|
|
|
114
83
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
## keep leading spaces (indent) - why?
|
|
122
|
-
##
|
|
123
|
-
## note - KEEP empty lines (get turned into BLANK token!!!!)
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
## "universal" newlines
|
|
127
|
-
## replace all windows-style cr+lf (\r\n) to lf (\n) only
|
|
128
|
-
txt = @txt.gsub( "\r\n", "\n" )
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
###
|
|
133
|
-
## quick hack for now
|
|
134
|
-
## remove html-style comments <!-- -->
|
|
135
|
-
## (incl. multi-line) with two spaces
|
|
136
|
-
## will mess-up lineno tracking!!!
|
|
137
|
-
## fix later to have function lineno & colno!!!
|
|
138
|
-
txt = @txt.gsub( HTML_COMMENT_RE ) do |m|
|
|
139
|
-
puts " [debug] preproc html comment:"
|
|
140
|
-
puts m
|
|
141
|
-
' '
|
|
142
|
-
end
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
=begin
|
|
146
|
-
##
|
|
147
|
-
## todo/fix - add a command line switch/option for auto-format fixes !!!
|
|
148
|
-
## quick hack - remove later
|
|
149
|
-
## auto-convert "old" legacy round markers (»)
|
|
150
|
-
txt = txt.gsub( %r{^ [ ]*
|
|
151
|
-
»
|
|
152
|
-
(?= [ ]+) ## require one trailing space for now!!
|
|
153
|
-
}ix ) do |_|
|
|
154
|
-
puts "!! WARN - auto-fix format; replacing old (alternate/legacy) round marker (»)"
|
|
155
|
-
'▪'
|
|
156
|
-
end
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
### 16.00 => 16:00
|
|
160
|
-
## todo/check - use space for positive lookbehind & ahead
|
|
161
|
-
## (instead of \b) - why? why not?
|
|
162
|
-
## note - check for/exclude 12.12. date in match
|
|
163
|
-
## use negative lookahead
|
|
164
|
-
## check for 12.12.94
|
|
165
|
-
## use positive lookbehind !!!
|
|
166
|
-
## must be space, comma or begin-of-line [ ,]|^
|
|
167
|
-
## or use negative lookbehind
|
|
168
|
-
## must NOT be dot
|
|
169
|
-
txt = txt.gsub( %r{
|
|
170
|
-
## check NEGATIVE lookbehind
|
|
171
|
-
(?<! [.]) ## do NOT match 12.94 in 12.12.94
|
|
172
|
-
\b
|
|
173
|
-
(?<h>\d{1,2})
|
|
174
|
-
\.
|
|
175
|
-
(?<m>\d{2})
|
|
176
|
-
\b
|
|
177
|
-
(?! [.] ) ## do NOT match 12.12.
|
|
178
|
-
}ix ) do |_|
|
|
179
|
-
m = $~ ## is $LAST_MATCH_DATA
|
|
180
|
-
puts "!! WARN - auto-fix format; replacing old (alternate/legacy) time format #{m[0]}"
|
|
181
|
-
"#{m[:h]}:#{m[:m]}" ## '\1:\2'
|
|
182
|
-
end
|
|
183
|
-
=end
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
###
|
|
189
|
-
## add more "native" multi-line comment-styles
|
|
190
|
-
## e.g. #[[ ... ]] or #<<< .. >>> or #<< .. >>
|
|
191
|
-
## or such - why? why not?
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
txt = txt.gsub( PREPROC_NOTA_BENE_RE ) do |m|
|
|
195
|
-
if m.include?( "\n" ) ## check for newlines (\n) and replace
|
|
196
|
-
puts " [debug] preproc (multi-line) note/nota bene block:"
|
|
197
|
-
puts m
|
|
198
|
-
## todo/check: replace with two spaces insead of ↵ - why? why not?
|
|
199
|
-
m.gsub( "\n", '↵' )
|
|
200
|
-
else
|
|
201
|
-
m
|
|
202
|
-
end
|
|
203
|
-
end
|
|
84
|
+
### skip comments
|
|
85
|
+
## todo/check - change to blank line
|
|
86
|
+
## to keep lineno (closer to original) - why? why not?
|
|
87
|
+
next if line.match?(/\A [ ]* ## optional leading space(s)
|
|
88
|
+
\#
|
|
89
|
+
/x )
|
|
204
90
|
|
|
91
|
+
## strip (inline) end-of-line comments (from line)
|
|
92
|
+
## check/discuss: make - inline comment require trailing space
|
|
93
|
+
## e.g. #1 vs # 1 - why? why not?
|
|
94
|
+
line = line.sub( / [ ]* ## (eat-up) optional leading space(s)
|
|
95
|
+
\#{1,}.*?
|
|
96
|
+
\z
|
|
97
|
+
/x, '' )
|
|
205
98
|
|
|
206
|
-
##
|
|
207
|
-
## e.g. used in (multi-line) TableNote
|
|
208
|
-
## 1.SOUTH KOREA 6 5 1 0 22- 1 16 [0-0]
|
|
209
|
-
## 2.LEBANON 6 3 1 2 11- 8 10 [0-2, 0-0]
|
|
210
|
-
## 3.Turkmenistan 6 3 0 3 8-11 9 [3-1]
|
|
211
|
-
## 4.Sri Lanka 6 0 0 6 2-23 0 [0-1]
|
|
212
|
-
## -.North Korea [withdrew after playing 5 matches due to safety concerns in
|
|
213
|
-
## connection with the Covid-19 pandemic; all results annulled]
|
|
214
|
-
##
|
|
215
|
-
## note - no longer used for now
|
|
216
|
-
## enclose multi-line notes in []
|
|
217
|
-
## removes need for line continuation for now
|
|
218
|
-
|
|
219
|
-
##
|
|
220
|
-
## txt = txt.gsub( LINE_CONTINUATION_RE ) do |_|
|
|
221
|
-
## puts " [debug] preproc line continuation"
|
|
222
|
-
## ## todo/check: replace with two spaces insead of ↵ - why? why not?
|
|
223
|
-
## '↵'
|
|
224
|
-
## end
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
#####
|
|
229
|
-
## (another) quick hack for now
|
|
230
|
-
## turn multi-line note blocks into
|
|
231
|
-
## single-line note blocks
|
|
232
|
-
## by changing newline (\n) to ⏎ (unicode U+23CE)
|
|
233
|
-
## or why not to ___ ?
|
|
234
|
-
##
|
|
235
|
-
## unicode options for return/arrows:
|
|
236
|
-
## - ↵ (U+21B5): Downwards Arrow With Corner Leftwards.
|
|
237
|
-
## This is the most common "carriage return" symbol.
|
|
238
|
-
## - ⏎ (U+23CE): Return Symbol.
|
|
239
|
-
## Specifically designated as the keyboard's "Return" key symbol,
|
|
240
|
-
## often used in user interfaces.
|
|
241
|
-
|
|
242
|
-
txt = txt.gsub( PREPROC_BLOCK_RE ) do |m|
|
|
243
|
-
if m.include?( "\n" ) ## check for newlines (\n) and replace
|
|
244
|
-
puts " [debug] preproc (multi-line) block:"
|
|
245
|
-
puts m
|
|
246
|
-
## todo/check: replace with two spaces insead of ↵ - why? why not?
|
|
247
|
-
m.gsub( "\n", '↵' )
|
|
248
|
-
else
|
|
249
|
-
m
|
|
250
|
-
end
|
|
251
|
-
end
|
|
252
99
|
|
|
100
|
+
####
|
|
101
|
+
# support __END__ marker to cut-off input
|
|
102
|
+
break if line.match?( /\A [ ]* ## optional leading space(s)
|
|
103
|
+
__END__
|
|
104
|
+
\z
|
|
105
|
+
/x )
|
|
253
106
|
|
|
254
|
-
####
|
|
255
|
-
## quick hack - keep re state/mode between tokenize calls!!!
|
|
256
|
-
@re ||= RE ## note - switch between RE & INSIDE_RE
|
|
257
|
-
|
|
258
107
|
|
|
259
|
-
txt.each_line do |line|
|
|
260
|
-
## line = line.rstrip ## note - MUST remove/strip trailing newline (spaces optional)!!!
|
|
261
|
-
line = line.strip ## note - strip leading AND trailing whitespaces
|
|
262
|
-
## note - trailing whitespace may incl. \n or \r\n!!!
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
##
|
|
266
|
-
###
|
|
267
|
-
## check for magic comments
|
|
268
|
-
## e.g # teletype: true or TELETYPE: TRUE
|
|
269
|
-
## tty/teletype
|
|
270
|
-
|
|
271
|
-
if line.start_with?('#') ### skip comments (& check magic comments!!)
|
|
272
|
-
|
|
273
|
-
if (m = MAGIC_COMMENT_RE.match(line))
|
|
274
|
-
magic_comment_key = m[:magic_comment_key].downcase
|
|
275
|
-
magic_comment_value = m[:magic_comment_value].downcase
|
|
276
|
-
|
|
277
|
-
## turn on teletype mode
|
|
278
|
-
## e.g. tty: true or teletype: true
|
|
279
|
-
if ['tty', 'teletype'].include?( magic_comment_key ) &&
|
|
280
|
-
['true'].include?( magic_comment_value )
|
|
281
|
-
puts " magic comment - turn on teletype (tty) mode"
|
|
282
|
-
@teletype = true
|
|
283
|
-
end
|
|
284
|
-
end
|
|
285
|
-
|
|
286
|
-
next
|
|
287
|
-
end
|
|
288
108
|
|
|
289
|
-
|
|
109
|
+
## auto-fixes line-by-line (e.g. check for tabs, smart quotes, etc.)
|
|
110
|
+
line = _prep_line( line )
|
|
290
111
|
|
|
291
112
|
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
break if line.strip == '__END__'
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
##
|
|
299
|
-
## first check for tabs
|
|
300
|
-
## add error/warn
|
|
301
|
-
## for auto-fix - replace tabs with two spaces
|
|
302
|
-
|
|
303
|
-
line = line.gsub( "\t" ) do |_|
|
|
304
|
-
## report error here
|
|
305
|
-
## todo/add error here
|
|
306
|
-
puts "!! WARN - auto-fix; replacing tab (\\t) with two spaces in line #{line.inspect}"
|
|
307
|
-
" " ## replace with two spaces
|
|
308
|
-
end
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
## U+00A0 (160) -- non-breaking space (unicode)
|
|
312
|
-
line = line.gsub( "\u00A0" ) do |uni|
|
|
313
|
-
## report error here
|
|
314
|
-
## todo/add error here
|
|
315
|
-
puts "!! WARN - auto-fix; replacing non-breaking unicode space (#{uni}/#{uni.ord}) w/ ascii space ( /#{" ".ord}) in line #{line.inspect}"
|
|
316
|
-
" " ## replace with space
|
|
317
|
-
end
|
|
318
|
-
|
|
319
|
-
###
|
|
320
|
-
## todo/fix - print unicode numbers for [–−]
|
|
321
|
-
## different candidates to differentiate and document!!!
|
|
322
|
-
## – => U+2013 (8211) -- En Dash (unicode)
|
|
323
|
-
## − => U+2212 (8722) -- Minus Sign (unicode)
|
|
324
|
-
line = line.gsub( /[–−]/ ) do |uni|
|
|
325
|
-
## report error here
|
|
326
|
-
## todo/add error here
|
|
327
|
-
puts "!! WARN - auto-fix; replacing unicode dash (#{uni}/#{uni.ord}) w/ ascii dash (-/#{"-".ord}) in line #{line.inspect}"
|
|
328
|
-
'-' ## replace with ascii dash (-)
|
|
329
|
-
end
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
puts "line: >#{line}<" if debug?
|
|
113
|
+
_trace( "line #{lineno}: >#{line}<" )
|
|
114
|
+
|
|
334
115
|
|
|
335
116
|
######
|
|
336
117
|
### special case for empty line (aka BLANK)
|
|
337
118
|
if line.empty?
|
|
338
119
|
## note - blank always resets parser mode to std/top-level!!!
|
|
339
120
|
@re = RE
|
|
340
|
-
tokens_by_line << [
|
|
121
|
+
tokens_by_line << [Token.virtual(:BLANK, lineno: lineno)]
|
|
341
122
|
elsif (m = HEADING_RE.match(line))
|
|
342
123
|
## note - heading always resets parser mode to std/top-level!!!
|
|
343
124
|
@re = RE
|
|
344
|
-
|
|
125
|
+
_trace( 'HEADING' )
|
|
345
126
|
## note - derive heading level from no of (leading) markers
|
|
346
127
|
## e.g. = is 1, == is 2, === is 3, etc.
|
|
347
|
-
heading_level = m[:heading_marker].size
|
|
348
|
-
tokens_by_line << [
|
|
128
|
+
heading_level = m[:heading_marker].size
|
|
129
|
+
tokens_by_line << [Token.new(:"H#{heading_level}", m[:heading], lineno: lineno)]
|
|
349
130
|
elsif (m = NOTA_BENE_RE.match(line))
|
|
350
131
|
## note - nota bene always resets parser mode to std/top-level!!!
|
|
351
132
|
@re = RE
|
|
352
|
-
tokens_by_line << [
|
|
353
|
-
elsif @re == RE && (m = TABLE_RE.match(line))
|
|
354
|
-
@re = TABLE_MORE_RE ## switch into table mode
|
|
355
|
-
if m[:table_heading]
|
|
356
|
-
tokens_by_line << [[:TABLE_HEADING, m[:table_heading]]]
|
|
357
|
-
else ## assume table (line) e.g. m[:table]
|
|
358
|
-
tokens_by_line << [[:TABLE_LINE, line]]
|
|
359
|
-
end
|
|
360
|
-
elsif @re == TABLE_MORE_RE
|
|
361
|
-
### todo/fix - check if no match and report/add error!!
|
|
362
|
-
## for now (ummatched) line gets auto-added as table line!!!
|
|
363
|
-
##
|
|
364
|
-
## note - MUST be followed by blank line (or nota bene/heading)
|
|
365
|
-
## to switch back into to top-level!!!!
|
|
366
|
-
m = TABLE_MORE_RE.match(line)
|
|
367
|
-
if m[:table_note]
|
|
368
|
-
tokens_by_line << [[:TABLE_NOTE, m[:table_note]]]
|
|
369
|
-
elsif m[:table_divider]
|
|
370
|
-
tokens_by_line << [[:TABLE_DIVIDER, m[:table_divider]]]
|
|
371
|
-
else ## assume table (line) e.g. m[:table]
|
|
372
|
-
tokens_by_line << [[:TABLE_LINE, line]]
|
|
373
|
-
end
|
|
374
|
-
elsif @re != TABLE_MORE_RE && (m = HRULER_RE.match(line))
|
|
375
|
-
## note - hruler (---)
|
|
376
|
-
## will only match if NOT in table mode!!!
|
|
377
|
-
## otherwise
|
|
378
|
-
## hruler always resets parser mode to std/top-level!!!
|
|
379
|
-
@re = RE
|
|
380
|
-
tokens_by_line << [[:HRULER, '<|HRULER|>']]
|
|
381
|
-
elsif @teletype && (@re == RE && IS_TTY_LINE_RE.match(line))
|
|
382
|
-
## try experimental TELETYPE (TTY) mode!!!
|
|
383
|
-
## note - turn on via magic comment e.g. tty/teletype: true
|
|
384
|
-
###
|
|
385
|
-
### move inside _tokenize_line - why? why not?
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
tokens_by_line << _tokenize_tty_line( line )
|
|
389
|
-
|
|
390
|
-
## note - dates such as
|
|
391
|
-
## APR 11 or 11 APR will trigger TELETYPE
|
|
392
|
-
### ## check letter
|
|
133
|
+
tokens_by_line << [Token.new(:NOTA_BENE, m[:nota_bene], lineno: lineno)]
|
|
393
134
|
else
|
|
394
135
|
|
|
395
|
-
more_tokens, more_errors = _tokenize_line( line )
|
|
396
|
-
|
|
397
|
-
tokens_by_line << more_tokens
|
|
136
|
+
more_tokens, more_errors = _tokenize_line( line, lineno )
|
|
137
|
+
|
|
138
|
+
tokens_by_line << more_tokens
|
|
398
139
|
errors += more_errors
|
|
399
140
|
end
|
|
400
141
|
end # each line
|
|
@@ -402,1084 +143,113 @@ def tokenize_with_errors
|
|
|
402
143
|
|
|
403
144
|
|
|
404
145
|
|
|
146
|
+
tokens_by_line = tokens_by_line.map do |tokens|
|
|
405
147
|
|
|
406
|
-
tokens_by_line = tokens_by_line.map do |tokens|
|
|
407
148
|
#################
|
|
408
|
-
## transform tokens (using simple patterns)
|
|
409
|
-
## to help along the (racc look ahead 1 - LA1) parser
|
|
149
|
+
## transform tokens (using simple patterns)
|
|
150
|
+
## to help along the (racc look ahead 1 - LA1) parser
|
|
410
151
|
nodes = []
|
|
411
152
|
|
|
412
153
|
buf = Tokens.new( tokens )
|
|
413
154
|
## pp buf
|
|
414
155
|
|
|
156
|
+
|
|
415
157
|
loop do
|
|
416
158
|
break if buf.eos?
|
|
417
159
|
|
|
418
160
|
if buf.match?( :DATE, :TIME ) ## merge DATE TIME into DATETIME
|
|
419
|
-
date = buf.next
|
|
420
|
-
time = buf.next
|
|
161
|
+
date = buf.next
|
|
162
|
+
time = buf.next
|
|
421
163
|
## puts "DATETIME:"
|
|
422
164
|
## pp date, time
|
|
165
|
+
|
|
423
166
|
## note: time value is { time: {} } or
|
|
424
167
|
## { time: {}, time_local {} }
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
nodes <<
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
168
|
+
text = date.text + ' ' + time.text ## concat string of two tokens; note - NO trailing comma (otherwise the statement continues on the next line and text gets assigned an array [string, value]!!)
|
|
169
|
+
value = { date: date.value }.merge( time.value )
|
|
170
|
+
|
|
171
|
+
nodes << Token.new(:DATETIME, text,
|
|
172
|
+
lineno: date.lineno,
|
|
173
|
+
offset: [date.offset[0],
|
|
174
|
+
time.offset[1]],
|
|
175
|
+
value: value )
|
|
176
|
+
### support date time with comma too - why? why not?
|
|
177
|
+
elsif buf.match?( :DATE, ',', :TIME )
|
|
178
|
+
date = buf.next
|
|
179
|
+
_ = buf.next ## ignore comma
|
|
180
|
+
time = buf.next
|
|
434
181
|
## puts "DATETIME:"
|
|
435
182
|
## pp date, time
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
nodes <<
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
val = [team + ' ' + score_team[0], ## concat string of two tokens
|
|
446
|
-
{ team: team }.merge( score_team[1] )
|
|
447
|
-
]
|
|
448
|
-
nodes << [:TEAMALT, val]
|
|
449
|
-
elsif buf.match?( :TEAM, :SCORE_TEAM_PEN )
|
|
450
|
-
team = buf.next[1]
|
|
451
|
-
score_team_pen = buf.next[1]
|
|
452
|
-
val = [team + ' ' + score_team_pen[0], ## concat string of two tokens
|
|
453
|
-
{ team: team }.merge( score_team_pen[1] )
|
|
454
|
-
]
|
|
455
|
-
nodes << [:TEAMALT_PEN, val]
|
|
456
|
-
elsif buf.match?( :TEAM, :SCORE_TEAM_NUM )
|
|
457
|
-
team = buf.next[1]
|
|
458
|
-
score_team_num = buf.next[1]
|
|
459
|
-
val = [team + ' ' + score_team_num[0], ## concat string of two tokens
|
|
460
|
-
{ team: team }.merge( score_team_num[1] )
|
|
461
|
-
]
|
|
462
|
-
nodes << [:TEAMALT_NUM, val]
|
|
463
|
-
elsif buf.match?( :GOAL_MINUTE, :',', :GOAL_MINUTE )
|
|
183
|
+
text = date.text + ', ' + time.text ## concat string of two tokens
|
|
184
|
+
value = { date: date.value }.merge( time.value )
|
|
185
|
+
|
|
186
|
+
nodes << Token.new(:DATETIME, text,
|
|
187
|
+
lineno: date.lineno,
|
|
188
|
+
offset: [date.offset[0],
|
|
189
|
+
time.offset[1]],
|
|
190
|
+
value: value )
|
|
191
|
+
elsif buf.match?( :GOAL_MINUTE, ',', :GOAL_MINUTE )
|
|
464
192
|
## note - only advance by two tokens!
|
|
465
193
|
## allows more :GOAL_MINUTE sequences!! e.g. 12,13,14 etc!!!
|
|
466
|
-
##
|
|
194
|
+
##
|
|
467
195
|
## help parser with comma shift/reduce conflict
|
|
468
196
|
## change ',' to GOAL_MINUTE_SEP !!!
|
|
469
|
-
nodes << buf.next ## pass through goal_minute
|
|
470
|
-
|
|
197
|
+
nodes << buf.next ## pass through goal_minute
|
|
198
|
+
comma = buf.next ## eat-up goal_minute_sep a.k.a. comma (,)
|
|
471
199
|
## and replace with dedicated sep(arator)
|
|
472
|
-
nodes <<
|
|
473
|
-
|
|
474
|
-
|
|
200
|
+
nodes << Token.new( :GOAL_MINUTE_SEP,
|
|
201
|
+
comma.text,
|
|
202
|
+
lineno: comma.lineno,
|
|
203
|
+
offset: comma.offset,
|
|
204
|
+
value: comma.value)
|
|
205
|
+
elsif buf.match?( ',', :INLINE_ATTENDANCE )
|
|
206
|
+
## note - allow optional comma before inline attendance
|
|
475
207
|
## help parser with comma shift/reduce conflict
|
|
476
208
|
## change ',' to INLINE_ATTENDANCE_SEP !!!
|
|
477
|
-
|
|
478
|
-
_ = buf.next ## eat-up inline_attendance_sep a.k.a. comma (,)
|
|
209
|
+
comma = buf.next ## eat-up inline_attendance_sep a.k.a. comma (,)
|
|
479
210
|
## and replace with dedicated sep(arator)
|
|
480
|
-
nodes <<
|
|
211
|
+
nodes << Token.new(:INLINE_ATTENDANCE_SEP,
|
|
212
|
+
comma.text,
|
|
213
|
+
lineno: comma.lineno,
|
|
214
|
+
offset: comma.offset,
|
|
215
|
+
value: comma.value)
|
|
216
|
+
nodes << buf.next ## pass through inline_attendance
|
|
481
217
|
else
|
|
482
218
|
## pass through
|
|
483
219
|
nodes << buf.next
|
|
484
220
|
end
|
|
485
221
|
end # loop
|
|
486
|
-
nodes
|
|
222
|
+
nodes
|
|
487
223
|
end # map tokens_by_line
|
|
488
224
|
|
|
489
225
|
|
|
490
|
-
|
|
226
|
+
## puts "tokens_by_line:"
|
|
227
|
+
## pp tokens_by_line
|
|
228
|
+
|
|
491
229
|
|
|
492
230
|
## flatten tokens
|
|
493
231
|
tokens = []
|
|
494
|
-
tokens_by_line.each do |
|
|
232
|
+
tokens_by_line.each do |tok_line|
|
|
495
233
|
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
234
|
+
## if debug?
|
|
235
|
+
## pp tok_line
|
|
236
|
+
## end
|
|
499
237
|
|
|
238
|
+
tokens += tok_line
|
|
500
239
|
|
|
501
|
-
###############
|
|
502
|
-
## "hacky" (automagic) line merges (remove newline)
|
|
503
|
-
## if line start with @ - check if incl. teams
|
|
504
|
-
|
|
505
|
-
###
|
|
506
|
-
### quick merge lines hack
|
|
507
|
-
## if line starts with geo-marker token @
|
|
508
|
-
## check if line incl. TEAM
|
|
509
|
-
## if yes, leave alone
|
|
510
|
-
## otherwise merge line into previous line!!
|
|
511
|
-
## - todo/fix - handle in possibly in grammar!!!
|
|
512
|
-
## for now match_line CAN start with @ London
|
|
513
|
-
## resulting in parser conflict(s)!!!
|
|
514
|
-
## e.g.
|
|
515
|
-
## England v Scotland
|
|
516
|
-
## @ London
|
|
517
|
-
## =>
|
|
518
|
-
## England v Scotland @ London
|
|
519
|
-
##
|
|
520
|
-
|
|
521
|
-
##
|
|
522
|
-
## note/todo - if INDENT / SPACES get added
|
|
523
|
-
## adjust here
|
|
524
|
-
## tok[0][0] == :INDENT (or :SPACES) &&
|
|
525
|
-
## tok[1][0] == :'@'
|
|
526
|
-
|
|
527
|
-
if tok[0] && tok[0][0] == :'@'
|
|
528
|
-
team = tok.find { |t| t[0] == :TEAM }
|
|
529
|
-
if team
|
|
530
|
-
## do nothing - keep as is (assume match_line starting w/ @)
|
|
531
|
-
else
|
|
532
|
-
## no team(s) found in line
|
|
533
|
-
## remove last token (that is, NEWLINE)
|
|
534
|
-
## note - possibly is blank ?! keep blank
|
|
535
|
-
tokens.pop if tokens[-1][0] == :NEWLINE
|
|
536
|
-
end
|
|
537
|
-
end
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
tokens += tok
|
|
541
240
|
## auto-add newlines (unless BLANK!!)
|
|
542
|
-
|
|
241
|
+
unless tok_line[0] && tok_line[0].type == :BLANK
|
|
242
|
+
## note - reuse lineno from first token in line
|
|
243
|
+
## use last - why? why not?
|
|
244
|
+
tokens << Token.newline( lineno: tok_line[0].lineno )
|
|
245
|
+
end
|
|
543
246
|
end
|
|
544
247
|
|
|
545
248
|
[tokens,errors]
|
|
546
|
-
end # method tokenize_with_errors
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
def _tokenize_line( line )
|
|
552
|
-
tokens = []
|
|
553
|
-
errors = [] ## keep a list of errors - why? why not?
|
|
554
249
|
|
|
250
|
+
end # method tokenize_with_errors
|
|
555
251
|
|
|
556
|
-
pos = 0
|
|
557
|
-
## track last offsets - to report error on no match
|
|
558
|
-
## or no match in end of string
|
|
559
|
-
offsets = [0,0]
|
|
560
|
-
m = nil
|
|
561
|
-
|
|
562
|
-
## track number of geo text seen
|
|
563
|
-
## (use for - do NOT break on two spaces if no geo text seen yet!!)
|
|
564
|
-
geo_count = 0
|
|
565
|
-
|
|
566
|
-
####
|
|
567
|
-
## quick hack - keep re state/mode between tokenize calls!!!
|
|
568
|
-
@re ||= RE ## note - switch between RE & INSIDE_RE
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
if @re == RE ## top-level
|
|
572
|
-
### check for modes once (per line) here to speed-up parsing
|
|
573
|
-
### for now goals only possible for start of line!!
|
|
574
|
-
### fix - remove optional [] - why? why not?
|
|
575
|
-
|
|
576
|
-
####
|
|
577
|
-
## note - ord e.g. (45) for match number can only start a (match) line
|
|
578
|
-
## "inline" use NOT possible
|
|
579
|
-
## note - ord (for ordinal number!!!) e.g match number (1), (42), etc.
|
|
580
|
-
if (m = START_WITH_ORD.match(line))
|
|
581
|
-
## note - strip enclosing () and convert to integer
|
|
582
|
-
tokens << [:ORD, [m[:ord], { value: m[:value].to_i(10) } ]]
|
|
583
|
-
|
|
584
|
-
offsets = [m.begin(0), m.end(0)]
|
|
585
|
-
pos = offsets[1] ## update pos
|
|
586
|
-
elsif (m = START_WITH_YEAR.match(line))
|
|
587
|
-
## note - strip enclosing () and convert to integer
|
|
588
|
-
tokens << [:YEAR, m[:year].to_i(10)]
|
|
589
|
-
|
|
590
|
-
offsets = [m.begin(0), m.end(0)]
|
|
591
|
-
pos = offsets[1] ## update pos
|
|
592
|
-
|
|
593
|
-
###
|
|
594
|
-
## todo/fix - rename to START_GROUP_DEF_LINE_RE !!!!
|
|
595
|
-
elsif (m = GROUP_DEF_LINE_RE.match( line ))
|
|
596
|
-
puts " ENTER GROUP_DEF_RE MODE" if debug?
|
|
597
|
-
@re = GROUP_DEF_RE
|
|
598
|
-
|
|
599
|
-
tokens << [:GROUP_DEF, m[:group_def]]
|
|
600
|
-
|
|
601
|
-
offsets = [m.begin(0), m.end(0)]
|
|
602
|
-
pos = offsets[1] ## update pos
|
|
603
|
-
|
|
604
|
-
### todo/fix - rename to PROP_KEY_RE to START_WITH_PROP_KEY_RE !!!
|
|
605
|
-
elsif (m = PROP_KEY_RE.match( line ))
|
|
606
|
-
## start with prop key (match will switch into prop mode!!!)
|
|
607
|
-
## - fix - remove leading spaces in regex (upstream) - why? why not?
|
|
608
|
-
##
|
|
609
|
-
### switch into new mode
|
|
610
|
-
## switch context to PROP_RE
|
|
611
|
-
puts " ENTER PROP_RE MODE" if debug?
|
|
612
|
-
key = m[:key]
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
### todo/fix - add prop yellow/red cards too - why? why not?
|
|
616
|
-
## todo/fix - separate sent off and red card
|
|
617
|
-
## sent-off - incl. red card, yellow/red card and the era before red cards!!
|
|
618
|
-
if ['sent off'].include?( key.downcase)
|
|
619
|
-
@re = PROP_CARDS_RE ## use CARDS_RE ???
|
|
620
|
-
tokens << [:PROP_SENTOFF, m[:key]]
|
|
621
|
-
elsif ['red cards'].include?( key.downcase )
|
|
622
|
-
@re = PROP_CARDS_RE ## use CARDS_RE ???
|
|
623
|
-
tokens << [:PROP_REDCARDS, m[:key]]
|
|
624
|
-
elsif ['yellow cards'].include?( key.downcase )
|
|
625
|
-
@re = PROP_CARDS_RE
|
|
626
|
-
tokens << [:PROP_YELLOWCARDS, m[:key]]
|
|
627
|
-
elsif ['ref', 'referee',
|
|
628
|
-
'refs', 'referees' ## note - allow/support assistant refs
|
|
629
|
-
].include?( key.downcase )
|
|
630
|
-
@re = PROP_REFEREE_RE
|
|
631
|
-
tokens << [:PROP_REFEREE, m[:key]]
|
|
632
|
-
elsif ['att', 'attn', 'attendance'].include?( key.downcase )
|
|
633
|
-
@re = PROP_ATTENDANCE_RE
|
|
634
|
-
tokens << [:PROP_ATTENDANCE, m[:key]]
|
|
635
|
-
|
|
636
|
-
# elsif ['goals'].include?( key.downcase )
|
|
637
|
-
# @re = PROP_GOAL_RE
|
|
638
|
-
# tokens << [:PROP_GOALS, m[:key]]
|
|
639
|
-
|
|
640
|
-
elsif ['penalties',
|
|
641
|
-
'penalty shootout',
|
|
642
|
-
'penalty shoot-out',
|
|
643
|
-
'penalty kicks'].include?( key.downcase )
|
|
644
|
-
@re = PROP_PENALTIES_RE
|
|
645
|
-
tokens << [:PROP_PENALTIES, m[:key]]
|
|
646
|
-
else ## assume (team) line-up
|
|
647
|
-
@re = PROP_RE ## use LINEUP_RE ???
|
|
648
|
-
tokens << [:PROP, m[:key]]
|
|
649
|
-
end
|
|
650
|
-
|
|
651
|
-
offsets = [m.begin(0), m.end(0)]
|
|
652
|
-
pos = offsets[1] ## update pos
|
|
653
|
-
###
|
|
654
|
-
### todo/fix
|
|
655
|
-
### rename to START_WITH_ROUND_DEF_OUTLINE_RE !!!!
|
|
656
|
-
elsif (m = ROUND_DEF_OUTLINE_RE.match( line ))
|
|
657
|
-
puts " ENTER ROUND_DEF_RE MODE" if debug?
|
|
658
|
-
@re = ROUND_DEF_RE
|
|
659
|
-
|
|
660
|
-
## note - return ROUND_DEF NOT ROUND_OUTLINE token
|
|
661
|
-
tokens << [:ROUND_DEF, m[:round_outline]]
|
|
662
|
-
|
|
663
|
-
offsets = [m.begin(0), m.end(0)]
|
|
664
|
-
pos = offsets[1] ## update pos
|
|
665
|
-
elsif (m = ROUND_OUTLINE_RE.match( line ))
|
|
666
|
-
puts " ROUND_OUTLINE" if debug?
|
|
667
|
-
## note - derive round level from no of (leading) markers
|
|
668
|
-
## e.g. ▪/:: is 1, ▪▪/::: is 2, ▪▪▪/:::: is 3, etc.
|
|
669
|
-
## note - ascii-style starts with double ::, thus, autodecrement by one!
|
|
670
|
-
round_level = m[:round_marker].size
|
|
671
|
-
round_level -= 1 if m[:round_marker].start_with?( '::' )
|
|
672
|
-
|
|
673
|
-
tokens << [:ROUND_OUTLINE, [m[:round_outline],
|
|
674
|
-
{ outline: m[:round_outline] ,
|
|
675
|
-
level: round_level}]]
|
|
676
|
-
|
|
677
|
-
## note - eats-up line for now (change later to only eat-up marker e.g. »|>>)
|
|
678
|
-
offsets = [m.begin(0), m.end(0)]
|
|
679
|
-
pos = offsets[1] ## update pos
|
|
680
|
-
elsif (m = START_GOAL_LINE_RE.match( line )) ## line starting with ( - assume
|
|
681
|
-
## switch context to GOAL_RE (goalline(s))
|
|
682
|
-
####
|
|
683
|
-
## note - check for alternate goal line styles / formats
|
|
684
|
-
if START_GOAL_LINE_COMPAT_RE.match(line )
|
|
685
|
-
## "legacy" style starting with minute e.g.
|
|
686
|
-
## (6 Puskás 0-1, 9 Czibor 0-2, 11 Morlock 1-2, 18 Rahn 2-2,
|
|
687
|
-
## 84 Rahn 3-2)
|
|
688
|
-
@re = GOAL_COMPAT_RE
|
|
689
|
-
puts " ENTER GOAL_COMPAT_RE MODE" if debug?
|
|
690
|
-
|
|
691
|
-
tokens << [:GOALS_COMPAT, "<|GOALS_COMPAT|>"]
|
|
692
|
-
elsif START_GOAL_LINE_ALT_RE.match( line )
|
|
693
|
-
## goals with scores e.g.
|
|
694
|
-
## (1-0 Franck Ribéry, 2-0 Ivica Olić, 2-1 Wayne Rooney)
|
|
695
|
-
## -or-
|
|
696
|
-
## (Dion Beljo 1-0
|
|
697
|
-
## 1-1 Andreas Gruber
|
|
698
|
-
## Matthias Seidl 2-1)
|
|
699
|
-
@re = GOAL_ALT_RE
|
|
700
|
-
puts " ENTER GOAL_ALT_RE MODE" if debug?
|
|
701
|
-
|
|
702
|
-
tokens << [:GOALS_ALT, "<|GOALS_ALT|>"]
|
|
703
|
-
else
|
|
704
|
-
## "standard" / default style
|
|
705
|
-
@re = GOAL_RE
|
|
706
|
-
puts " ENTER GOAL_RE MODE" if debug?
|
|
707
|
-
|
|
708
|
-
tokens << [:GOALS, "<|GOALS|>"]
|
|
709
|
-
end
|
|
710
|
-
|
|
711
|
-
## note - eat-up ( for now
|
|
712
|
-
## pass along "virtual" GOALS or GOALS_ALT token
|
|
713
|
-
## (see INLINE_GOALS for the starting goal line inline)
|
|
714
|
-
offsets = [m.begin(0), m.end(0)]
|
|
715
|
-
pos = offsets[1] ## update pos
|
|
716
|
-
end
|
|
717
|
-
end
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
old_pos = -1 ## allows to backtrack to old pos (used in geo)
|
|
722
|
-
|
|
723
|
-
while m = @re.match( line, pos )
|
|
724
|
-
# if debug?
|
|
725
|
-
# pp m
|
|
726
|
-
# puts "pos: #{pos}"
|
|
727
|
-
# end
|
|
728
|
-
offsets = [m.begin(0), m.end(0)]
|
|
729
|
-
|
|
730
|
-
if offsets[0] != pos
|
|
731
|
-
## match NOT starting at start/begin position!!!
|
|
732
|
-
## report parse error!!!
|
|
733
|
-
msg = "!! WARN - parse error (tokenize) - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
|
|
734
|
-
puts msg
|
|
735
|
-
|
|
736
|
-
errors << "parse error (tokenize) - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
|
|
737
|
-
log( msg )
|
|
738
|
-
end
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
##
|
|
742
|
-
## todo/fix - also check if possible
|
|
743
|
-
## if no match but not yet end of string!!!!
|
|
744
|
-
## report skipped text run too!!!
|
|
745
|
-
|
|
746
|
-
old_pos = pos
|
|
747
|
-
pos = offsets[1]
|
|
748
|
-
|
|
749
|
-
# pp offsets if debug?
|
|
750
|
-
|
|
751
|
-
##
|
|
752
|
-
## note: racc requires pairs e.g. [:TOKEN, VAL]
|
|
753
|
-
## for VAL use "text" or ["text", { opts }] array
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
t = if @re == ROUND_DEF_RE
|
|
757
|
-
if m[:spaces] || m[:space]
|
|
758
|
-
nil ## skip spaces
|
|
759
|
-
elsif m[:date]
|
|
760
|
-
[:DATE, [m[:date], _build_date( m )]]
|
|
761
|
-
elsif m[:duration]
|
|
762
|
-
[:DURATION, [m[:duration], _build_duration( m )]]
|
|
763
|
-
elsif m[:sym]
|
|
764
|
-
sym = m[:sym]
|
|
765
|
-
case sym
|
|
766
|
-
when '|' then [:'|']
|
|
767
|
-
when ':' then [:':']
|
|
768
|
-
when ',' then [:',']
|
|
769
|
-
else
|
|
770
|
-
puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
|
|
771
|
-
nil ## ignore others (e.g. brackets [])
|
|
772
|
-
end
|
|
773
|
-
elsif m[:any]
|
|
774
|
-
## todo/check log error
|
|
775
|
-
msg = "parse error (tokenize round_def) - skipping any match>#{m[:any]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
|
|
776
|
-
puts "!! WARN - #{msg}"
|
|
777
|
-
|
|
778
|
-
errors << msg
|
|
779
|
-
log( "!! WARN - #{msg}" )
|
|
780
|
-
|
|
781
|
-
nil
|
|
782
|
-
else
|
|
783
|
-
## report error/raise exception
|
|
784
|
-
puts "!!! TOKENIZE ERROR - no match found"
|
|
785
|
-
nil
|
|
786
|
-
end
|
|
787
|
-
elsif @re == GROUP_DEF_RE
|
|
788
|
-
if m[:spaces] || m[:space]
|
|
789
|
-
nil ## skip spaces
|
|
790
|
-
elsif m[:text]
|
|
791
|
-
[:TEAM, m[:text]]
|
|
792
|
-
elsif m[:sym]
|
|
793
|
-
sym = m[:sym]
|
|
794
|
-
case sym
|
|
795
|
-
when '|' then [:'|']
|
|
796
|
-
when ':' then [:':']
|
|
797
|
-
when ',' then [:',']
|
|
798
|
-
else
|
|
799
|
-
puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
|
|
800
|
-
nil ## ignore others (e.g. brackets [])
|
|
801
|
-
end
|
|
802
|
-
elsif m[:any]
|
|
803
|
-
## todo/check log error
|
|
804
|
-
msg = "parse error (tokenize group_def) - skipping any match>#{m[:any]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
|
|
805
|
-
puts "!! WARN - #{msg}"
|
|
806
|
-
|
|
807
|
-
errors << msg
|
|
808
|
-
log( "!! WARN - #{msg}" )
|
|
809
|
-
|
|
810
|
-
nil
|
|
811
|
-
else
|
|
812
|
-
## report error/raise exception
|
|
813
|
-
puts "!!! TOKENIZE ERROR - no match found"
|
|
814
|
-
nil
|
|
815
|
-
end
|
|
816
|
-
elsif @re == GEO_RE
|
|
817
|
-
### note - possibly end inline geo on [ (and others?? in the future
|
|
818
|
-
## note: break on double spaces e.g.
|
|
819
|
-
## e.g. Jul/16 @ Arena Auf Schalke, Gelsenkirchen Serbia 0-1 England
|
|
820
|
-
if m[:spaces]
|
|
821
|
-
### note - do NOT break out
|
|
822
|
-
## if not text seen yet!!!
|
|
823
|
-
if geo_count > 0
|
|
824
|
-
## get out-off geo mode and backtrack (w/ next)
|
|
825
|
-
puts " LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
|
|
826
|
-
@re = RE
|
|
827
|
-
pos = old_pos
|
|
828
|
-
next ## backtrack (resume new loop step)
|
|
829
|
-
else
|
|
830
|
-
nil ## skip spaces
|
|
831
|
-
end
|
|
832
|
-
elsif m[:space]
|
|
833
|
-
nil ## skip (single) space
|
|
834
|
-
elsif m[:text]
|
|
835
|
-
geo_count += 1
|
|
836
|
-
[:GEO, m[:text]] ## keep pos - why? why not?
|
|
837
|
-
elsif m[:geo_end] ## "hacky" special comma; always ends geo mode!!!
|
|
838
|
-
## get out-off geo mode and backtrack (w/ next)
|
|
839
|
-
puts " LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
|
|
840
|
-
@re = RE
|
|
841
|
-
pos = old_pos
|
|
842
|
-
next ## backtrack (resume new loop step)
|
|
843
|
-
elsif m[:sym]
|
|
844
|
-
sym = m[:sym]
|
|
845
|
-
## return symbols "inline" as is - why? why not?
|
|
846
|
-
## (?<sym>[;,@|\[\]-])
|
|
847
|
-
case sym
|
|
848
|
-
## note - reset geo_count to 0 (avoids break on two spaces)
|
|
849
|
-
## if separator seen!!
|
|
850
|
-
when ',' then geo_count = 0; [:',']
|
|
851
|
-
when '›' then geo_count = 0; [:','] ## note - treat geo sep › (unicode) like comma for now!!!
|
|
852
|
-
when '>' then geo_count = 0; [:','] ## note - treat geo sep > (ascii) like comma for now!!!
|
|
853
|
-
when '[' then
|
|
854
|
-
## get out-off geo mode and backtrack (w/ next)
|
|
855
|
-
puts " LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
|
|
856
|
-
@re = RE
|
|
857
|
-
pos = old_pos
|
|
858
|
-
next ## backtrack (resume new loop step)
|
|
859
|
-
else
|
|
860
|
-
puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
|
|
861
|
-
nil ## ignore others (e.g. brackets [])
|
|
862
|
-
end
|
|
863
|
-
elsif m[:any]
|
|
864
|
-
## todo/check log error
|
|
865
|
-
msg = "parse error (tokenize geo) - skipping any match>#{m[:any]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
|
|
866
|
-
puts "!! WARN - #{msg}"
|
|
867
|
-
|
|
868
|
-
errors << msg
|
|
869
|
-
log( "!! WARN - #{msg}" )
|
|
870
|
-
|
|
871
|
-
nil
|
|
872
|
-
else
|
|
873
|
-
## report error/raise exception
|
|
874
|
-
puts "!!! TOKENIZE ERROR - no match found"
|
|
875
|
-
nil
|
|
876
|
-
end
|
|
877
|
-
elsif @re == PROP_CARDS_RE
|
|
878
|
-
if m[:space] || m[:spaces]
|
|
879
|
-
nil ## skip space(s)
|
|
880
|
-
elsif m[:prop_name]
|
|
881
|
-
[:PROP_NAME, m[:name]]
|
|
882
|
-
elsif m[:minute]
|
|
883
|
-
minute = {}
|
|
884
|
-
minute[:m] = m[:value].to_i(10)
|
|
885
|
-
minute[:offset] = m[:value2].to_i(10) if m[:value2]
|
|
886
|
-
## note - for debugging keep (pass along) "literal" minute
|
|
887
|
-
[:MINUTE, [m[:minute], minute]]
|
|
888
|
-
elsif m[:sym]
|
|
889
|
-
sym = m[:sym]
|
|
890
|
-
case sym
|
|
891
|
-
when ',' then [:',']
|
|
892
|
-
when ';' then [:';']
|
|
893
|
-
when '-' then [:'-']
|
|
894
|
-
else
|
|
895
|
-
nil ## ignore others (e.g. brackets [])
|
|
896
|
-
end
|
|
897
|
-
else
|
|
898
|
-
## report error
|
|
899
|
-
puts "!!! TOKENIZE ERROR (PROP_CARDS_RE) - no match found"
|
|
900
|
-
nil
|
|
901
|
-
end
|
|
902
|
-
elsif @re == PROP_RE ### todo/fix - change to LINEUP_RE !!!!
|
|
903
|
-
if m[:space] || m[:spaces]
|
|
904
|
-
nil ## skip space(s)
|
|
905
|
-
elsif m[:prop_key] ## check for inline prop keys
|
|
906
|
-
key = m[:key]
|
|
907
|
-
## supported for now coach/trainer (add manager?)
|
|
908
|
-
if ['coach',
|
|
909
|
-
'trainer'].include?( key.downcase )
|
|
910
|
-
[:COACH, m[:key]] ## use COACH_KEY or such - why? why not?
|
|
911
|
-
else
|
|
912
|
-
## report error - for unknown (inline) prop key in lineup
|
|
913
|
-
nil
|
|
914
|
-
end
|
|
915
|
-
elsif m[:inline_captain]
|
|
916
|
-
[:INLINE_CAPTAIN, m[:inline_captain]]
|
|
917
|
-
elsif m[:inline_yellow]
|
|
918
|
-
card = {}
|
|
919
|
-
card[:m] = m[:minute].to_i(10) if m[:minute]
|
|
920
|
-
card[:offset] = m[:offset].to_i(10) if m[:offset]
|
|
921
|
-
[:INLINE_YELLOW, [m[:inline_yellow], card]]
|
|
922
|
-
elsif m[:inline_red]
|
|
923
|
-
card = {}
|
|
924
|
-
card[:m] = m[:minute].to_i(10) if m[:minute]
|
|
925
|
-
card[:offset] = m[:offset].to_i(10) if m[:offset]
|
|
926
|
-
[:INLINE_RED, [m[:inline_red], card]]
|
|
927
|
-
elsif m[:inline_yellow_red]
|
|
928
|
-
card = {}
|
|
929
|
-
card[:m] = m[:minute].to_i(10) if m[:minute]
|
|
930
|
-
card[:offset] = m[:offset].to_i(10) if m[:offset]
|
|
931
|
-
[:INLINE_YELLOW_RED, [m[:inline_yellow_red], card]]
|
|
932
|
-
elsif m[:prop_name]
|
|
933
|
-
[:PROP_NAME, m[:name]]
|
|
934
|
-
elsif m[:minute]
|
|
935
|
-
minute = {}
|
|
936
|
-
minute[:m] = m[:value].to_i(10)
|
|
937
|
-
minute[:offset] = m[:value2].to_i(10) if m[:value2]
|
|
938
|
-
[:MINUTE, [m[:minute], minute]]
|
|
939
|
-
elsif m[:sym]
|
|
940
|
-
sym = m[:sym]
|
|
941
|
-
## return symbols "inline" as is - why? why not?
|
|
942
|
-
## (?<sym>[;,@|\[\]-])
|
|
943
|
-
|
|
944
|
-
case sym
|
|
945
|
-
when ',' then [:',']
|
|
946
|
-
when ';' then [:';']
|
|
947
|
-
when '[' then [:'[']
|
|
948
|
-
when ']' then [:']']
|
|
949
|
-
when '(' then [:'(']
|
|
950
|
-
when ')' then [:')']
|
|
951
|
-
when '-' then [:'-']
|
|
952
|
-
else
|
|
953
|
-
nil ## ignore others (e.g. brackets [])
|
|
954
|
-
end
|
|
955
|
-
else
|
|
956
|
-
## report error
|
|
957
|
-
puts "!!! TOKENIZE ERROR (PROP_RE) - no match found"
|
|
958
|
-
nil
|
|
959
|
-
end
|
|
960
|
-
elsif @re == PROP_ATTENDANCE_RE
|
|
961
|
-
if m[:space] || m[:spaces]
|
|
962
|
-
nil ## skip space(s)
|
|
963
|
-
elsif m[:enclosed_name]
|
|
964
|
-
## reserved for use for sold out or such (in the future) - why? why not?
|
|
965
|
-
[:ENCLOSED_NAME, m[:name]]
|
|
966
|
-
elsif m[:num]
|
|
967
|
-
[:PROP_NUM, [m[:num], { value: m[:value].to_i(10) } ]]
|
|
968
|
-
=begin
|
|
969
|
-
elsif m[:sym]
|
|
970
|
-
sym = m[:sym]
|
|
971
|
-
case sym
|
|
972
|
-
when ',' then [:',']
|
|
973
|
-
when ';' then [:';']
|
|
974
|
-
# when '[' then [:'[']
|
|
975
|
-
# when ']' then [:']']
|
|
976
|
-
else
|
|
977
|
-
nil ## ignore others (e.g. brackets [])
|
|
978
|
-
end
|
|
979
|
-
=end
|
|
980
|
-
else
|
|
981
|
-
## report error
|
|
982
|
-
puts "!!! TOKENIZE ERROR (PROP_ATTENDANCE_RE) - no match found"
|
|
983
|
-
nil
|
|
984
|
-
end
|
|
985
|
-
elsif @re == PROP_REFEREE_RE
|
|
986
|
-
if m[:space] || m[:spaces]
|
|
987
|
-
nil ## skip space(s)
|
|
988
|
-
elsif m[:prop_key] ## check for inline prop keys
|
|
989
|
-
key = m[:key]
|
|
990
|
-
## supported for now att/attn/attendance
|
|
991
|
-
if ['att', 'attn', 'attendance' ].include?( key.downcase )
|
|
992
|
-
[:ATTENDANCE, m[:key]] ## use COACH_KEY or such - why? why not?
|
|
993
|
-
else
|
|
994
|
-
## report error - for unknown (inline) prop key in referee prop
|
|
995
|
-
nil
|
|
996
|
-
end
|
|
997
|
-
elsif m[:prop_name] ## note - change prop_name to player
|
|
998
|
-
[:PROP_NAME, m[:name]] ### use PLAYER for token - why? why not?
|
|
999
|
-
elsif m[:num]
|
|
1000
|
-
[:PROP_NUM, [m[:num], { value: m[:value].to_i(10) } ]]
|
|
1001
|
-
elsif m[:enclosed_name]
|
|
1002
|
-
## use HOLD,SAVE,POST or such keys - why? why not?
|
|
1003
|
-
[:ENCLOSED_NAME, m[:name]]
|
|
1004
|
-
elsif m[:sym]
|
|
1005
|
-
sym = m[:sym]
|
|
1006
|
-
case sym
|
|
1007
|
-
when ',' then [:',']
|
|
1008
|
-
when ';' then [:';']
|
|
1009
|
-
# when '[' then [:'[']
|
|
1010
|
-
# when ']' then [:']']
|
|
1011
|
-
else
|
|
1012
|
-
nil ## ignore others (e.g. brackets [])
|
|
1013
|
-
end
|
|
1014
|
-
else
|
|
1015
|
-
## report error
|
|
1016
|
-
puts "!!! TOKENIZE ERROR (PROP_REFEREE_RE) - no match found"
|
|
1017
|
-
nil
|
|
1018
|
-
end
|
|
1019
|
-
elsif @re == PROP_PENALTIES_RE
|
|
1020
|
-
if m[:space] || m[:spaces]
|
|
1021
|
-
nil ## skip space(s)
|
|
1022
|
-
elsif m[:prop_name] ## note - change prop_name to player
|
|
1023
|
-
[:PROP_NAME, m[:name]] ### use PLAYER for token - why? why not?
|
|
1024
|
-
elsif m[:enclosed_name]
|
|
1025
|
-
## use HOLD,SAVE,POST or such keys - why? why not?
|
|
1026
|
-
[:ENCLOSED_NAME, m[:name]]
|
|
1027
|
-
elsif m[:score]
|
|
1028
|
-
score = {}
|
|
1029
|
-
## must always have ft for now e.g. 1-1 or such
|
|
1030
|
-
### change to (generic) score from ft -
|
|
1031
|
-
## might be score a.e.t. or such - why? why not?
|
|
1032
|
-
score[:score] = [m[:score1].to_i(10),
|
|
1033
|
-
m[:score2].to_i(10)]
|
|
1034
|
-
[:SCORE, [m[:score], score]]
|
|
1035
|
-
elsif m[:sym]
|
|
1036
|
-
sym = m[:sym]
|
|
1037
|
-
case sym
|
|
1038
|
-
when ',' then [:',']
|
|
1039
|
-
when ';' then [:';']
|
|
1040
|
-
when '[' then [:'[']
|
|
1041
|
-
when ']' then [:']']
|
|
1042
|
-
else
|
|
1043
|
-
nil ## ignore others (e.g. brackets [])
|
|
1044
|
-
end
|
|
1045
|
-
else
|
|
1046
|
-
## report error
|
|
1047
|
-
puts "!!! TOKENIZE ERROR (PROP_PENALTIES_RE) - no match found"
|
|
1048
|
-
nil
|
|
1049
|
-
end
|
|
1050
|
-
elsif @re == GOAL_COMPAT_RE
|
|
1051
|
-
if m[:space] || m[:spaces]
|
|
1052
|
-
nil ## skip space(s)
|
|
1053
|
-
elsif m[:prop_name] ## note - change prop_name to player
|
|
1054
|
-
[:PLAYER, m[:name]]
|
|
1055
|
-
elsif m[:minute]
|
|
1056
|
-
minute = _build_minute( m )
|
|
1057
|
-
[:MINUTE, [m[:minute], minute]]
|
|
1058
|
-
elsif m[:goal_type]
|
|
1059
|
-
goal_type = _build_goal_type( m )
|
|
1060
|
-
[:GOAL_TYPE, [m[:goal_type], goal_type]]
|
|
1061
|
-
elsif m[:score]
|
|
1062
|
-
score = {}
|
|
1063
|
-
## note - score is "generic"
|
|
1064
|
-
## might be full-time (ft) or
|
|
1065
|
-
## after extra-time (aet) or such
|
|
1066
|
-
## or even undecided/unknown
|
|
1067
|
-
## thus, use score1/score2 and NOT ft1/ft2
|
|
1068
|
-
score[:score] = [m[:score1].to_i(10),
|
|
1069
|
-
m[:score2].to_i(10)]
|
|
1070
|
-
## note - for debugging keep (pass along) "literal" score
|
|
1071
|
-
[:SCORE, [m[:score], score]]
|
|
1072
|
-
elsif m[:sym]
|
|
1073
|
-
sym = m[:sym]
|
|
1074
|
-
## return symbols "inline" as is - why? why not?
|
|
1075
|
-
## (?<sym>[;,@|\[\]-])
|
|
1076
|
-
|
|
1077
|
-
case sym
|
|
1078
|
-
when ',' then [:',']
|
|
1079
|
-
when ')' ## leave goal mode!!
|
|
1080
|
-
puts " LEAVE GOAL_COMPAT_RE MODE" if debug?
|
|
1081
|
-
@re = RE
|
|
1082
|
-
## note - use/return GOAL_END token - change to GOAL_END_PAREN(THESIS)
|
|
1083
|
-
## or GOAL_PAREN_CLOSE/END ???
|
|
1084
|
-
[:GOALS_END, '<|GOALS_END|>']
|
|
1085
|
-
else
|
|
1086
|
-
nil ## ignore others (e.g. brackets [])
|
|
1087
|
-
end
|
|
1088
|
-
else
|
|
1089
|
-
## report error
|
|
1090
|
-
puts "!!! TOKENIZE ERROR (GOAL_COMPAT_RE) - no match found"
|
|
1091
|
-
nil
|
|
1092
|
-
end
|
|
1093
|
-
elsif @re == GOAL_ALT_RE
|
|
1094
|
-
if m[:space] || m[:spaces]
|
|
1095
|
-
nil ## skip space(s)
|
|
1096
|
-
elsif m[:prop_name] ## note - change prop_name to player
|
|
1097
|
-
[:PLAYER, m[:name]]
|
|
1098
|
-
elsif m[:goal_minute]
|
|
1099
|
-
minute = _build_goal_minute( m )
|
|
1100
|
-
[:GOAL_MINUTE, [m[:goal_minute], minute]]
|
|
1101
|
-
elsif m[:goal_type]
|
|
1102
|
-
goal_type = _build_goal_type( m )
|
|
1103
|
-
[:GOAL_TYPE, [m[:goal_type], goal_type]]
|
|
1104
|
-
elsif m[:score]
|
|
1105
|
-
score = {}
|
|
1106
|
-
## note - score is "generic"
|
|
1107
|
-
## might be full-time (ft) or
|
|
1108
|
-
## after extra-time (aet) or such
|
|
1109
|
-
## or even undecided/unknown
|
|
1110
|
-
## thus, use score1/score2 and NOT ft1/ft2
|
|
1111
|
-
score[:score] = [m[:score1].to_i(10),
|
|
1112
|
-
m[:score2].to_i(10)]
|
|
1113
|
-
## note - for debugging keep (pass along) "literal" score
|
|
1114
|
-
[:SCORE, [m[:score], score]]
|
|
1115
|
-
elsif m[:sym]
|
|
1116
|
-
sym = m[:sym]
|
|
1117
|
-
## return symbols "inline" as is - why? why not?
|
|
1118
|
-
## (?<sym>[;,@|\[\]-])
|
|
1119
|
-
|
|
1120
|
-
case sym
|
|
1121
|
-
when ',' then [:',']
|
|
1122
|
-
when ')' ## leave goal mode!!
|
|
1123
|
-
puts " LEAVE GOAL_ALT_RE MODE" if debug?
|
|
1124
|
-
@re = RE
|
|
1125
|
-
## note - use/return GOAL_END token - change to GOAL_END_PAREN(THESIS)
|
|
1126
|
-
## or GOAL_PAREN_CLOSE/END ???
|
|
1127
|
-
[:GOALS_END, '<|GOALS_END|>']
|
|
1128
|
-
else
|
|
1129
|
-
nil ## ignore others (e.g. brackets [])
|
|
1130
|
-
end
|
|
1131
|
-
else
|
|
1132
|
-
## report error
|
|
1133
|
-
puts "!!! TOKENIZE ERROR (GOAL_ALT_RE) - no match found"
|
|
1134
|
-
nil
|
|
1135
|
-
end
|
|
1136
|
-
elsif @re == GOAL_RE
|
|
1137
|
-
if m[:space] || m[:spaces]
|
|
1138
|
-
nil ## skip space(s)
|
|
1139
|
-
elsif m[:goals_none] ## note - eats-up semicolon!! e.g. -; or - ;
|
|
1140
|
-
[:GOALS_NONE, "<|GOALS_NONE|>"]
|
|
1141
|
-
elsif m[:goal_sep_alt]
|
|
1142
|
-
[:GOAL_SEP_ALT, "<|GOAL_SEP_ALT|>" ] ## e.g. dash (-) WITH leading & trailing space required
|
|
1143
|
-
elsif m[:prop_name] ## note - change prop_name to player
|
|
1144
|
-
[:PLAYER, m[:name]]
|
|
1145
|
-
elsif m[:goal_minute]
|
|
1146
|
-
minute = _build_goal_minute( m )
|
|
1147
|
-
[:GOAL_MINUTE, [m[:goal_minute], minute]]
|
|
1148
|
-
elsif m[:goal_count]
|
|
1149
|
-
count = _build_goal_count( m )
|
|
1150
|
-
[:GOAL_COUNT, [m[:goal_count], count]]
|
|
1151
|
-
elsif m[:sym]
|
|
1152
|
-
sym = m[:sym]
|
|
1153
|
-
## return symbols "inline" as is - why? why not?
|
|
1154
|
-
## (?<sym>[;,@|\[\]-])
|
|
1155
|
-
|
|
1156
|
-
case sym
|
|
1157
|
-
when ',' then [:',']
|
|
1158
|
-
when ';' then [:';']
|
|
1159
|
-
# when '[' then [:'[']
|
|
1160
|
-
# when ']' then [:']']
|
|
1161
|
-
when ')' ## leave goal mode!!
|
|
1162
|
-
puts " LEAVE GOAL_RE MODE" if debug?
|
|
1163
|
-
@re = RE
|
|
1164
|
-
## note - use/return GOAL_END token - change to GOAL_END_PAREN(THESIS)
|
|
1165
|
-
## or GOAL_PAREN_CLOSE/END ???
|
|
1166
|
-
[:GOALS_END, '<|GOALS_END|>']
|
|
1167
|
-
else
|
|
1168
|
-
nil ## ignore others (e.g. brackets [])
|
|
1169
|
-
end
|
|
1170
|
-
else
|
|
1171
|
-
## report error
|
|
1172
|
-
puts "!!! TOKENIZE ERROR (GOAL_RE) - no match found"
|
|
1173
|
-
nil
|
|
1174
|
-
end
|
|
1175
|
-
###################################################
|
|
1176
|
-
## assume TOP_LEVEL (a.k.a. RE) machinery
|
|
1177
|
-
else
|
|
1178
|
-
if m[:space] || m[:spaces]
|
|
1179
|
-
nil ## skip space(s)
|
|
1180
|
-
elsif m[:text]
|
|
1181
|
-
## note - top-level (for now always) assumes TEAM for TEXT match!!
|
|
1182
|
-
[:TEAM, m[:text]] ## keep pos - why? why not?
|
|
1183
|
-
elsif m[:status] ## (match) status e.g. cancelled, awarded, etc.
|
|
1184
|
-
[:STATUS, [m[:status], _build_status( m ) ]]
|
|
1185
|
-
elsif m[:inline_wo] ## w/o - walkover (match status)
|
|
1186
|
-
[:INLINE_WO, m[:inline_wo]]
|
|
1187
|
-
elsif m[:inline_np] ## n/p - not played (match status)
|
|
1188
|
-
[:INLINE_NP, m[:inline_np]]
|
|
1189
|
-
elsif m[:inline_bye] ## bye (match status)
|
|
1190
|
-
[:INLINE_BYE, m[:inline_bye]]
|
|
1191
|
-
elsif m[:inline_abd] ## abd/abd. - abandoned (match status)
|
|
1192
|
-
[:INLINE_ABD, m[:inline_abd]]
|
|
1193
|
-
elsif m[:inline_void] ## abd/abd. - abandoned (match status)
|
|
1194
|
-
[:INLINE_VOID, m[:inline_void]]
|
|
1195
|
-
elsif m[:inline_susp] ## susp/susp. - suspended (match status)
|
|
1196
|
-
[:INLINE_SUSP, m[:inline_susp]]
|
|
1197
|
-
elsif m[:inline_ppd] ## ppd/ppd. or postp/postp. - postponed (match status)
|
|
1198
|
-
[:INLINE_PPD, m[:inline_ppd]]
|
|
1199
|
-
elsif m[:inline_awd] ## awd/awd. - awarded (match status)
|
|
1200
|
-
[:INLINE_AWD, m[:inline_awd]]
|
|
1201
|
-
elsif m[:inline_canc] ## canc/canc. - cancelled/canceled (match status)
|
|
1202
|
-
[:INLINE_CANC, m[:inline_canc]]
|
|
1203
|
-
|
|
1204
|
-
elsif m[:team_home]
|
|
1205
|
-
[:TEAM_HOME, m[:team_home]]
|
|
1206
|
-
elsif m[:team_away]
|
|
1207
|
-
[:TEAM_AWAY, m[:team_away]]
|
|
1208
|
-
elsif m[:team_neutral]
|
|
1209
|
-
[:TEAM_NEUTRAL, m[:team_neutral]]
|
|
1210
|
-
|
|
1211
|
-
elsif m[:attendance]
|
|
1212
|
-
att = {}
|
|
1213
|
-
att[:value] = m[:value].gsub( '_', '' ).to_i(10)
|
|
1214
|
-
## note - for token id use INLINE_ATTENDANCE (ATTENDANCE in use for prop!!!)
|
|
1215
|
-
[:INLINE_ATTENDANCE, [m[:attendance], att ]]
|
|
1216
|
-
elsif m[:note]
|
|
1217
|
-
### todo/check:
|
|
1218
|
-
## use value hash - why? why not? or simplify to:
|
|
1219
|
-
## [:NOTE, [m[:note], {note: m[:note] } ]]
|
|
1220
|
-
[:NOTE, m[:note]]
|
|
1221
|
-
elsif m[:time]
|
|
1222
|
-
[:TIME, [m[:time], _build_time(m)]]
|
|
1223
|
-
elsif m[:date]
|
|
1224
|
-
[:DATE, [m[:date], _build_date(m)]]
|
|
1225
|
-
elsif m[:date_legs]
|
|
1226
|
-
[:DATE_LEGS, [m[:date_legs], _build_date_legs(m)]]
|
|
1227
|
-
elsif m[:score_team]
|
|
1228
|
-
[:SCORE_TEAM, [m[:score_team], _build_score_team(m)]]
|
|
1229
|
-
elsif m[:score_team_pen]
|
|
1230
|
-
[:SCORE_TEAM_PEN, [m[:score_team_pen], _build_score_team_pen(m)]]
|
|
1231
|
-
elsif m[:score_team_num]
|
|
1232
|
-
[:SCORE_TEAM_NUM, [m[:score_team_num], _build_score_team_num(m)]]
|
|
1233
|
-
elsif m[:score_legs]
|
|
1234
|
-
legs = {}
|
|
1235
|
-
|
|
1236
|
-
### leg1
|
|
1237
|
-
score = {}
|
|
1238
|
-
score[:ft] = [m[:leg1_ft1].to_i(10),
|
|
1239
|
-
m[:leg1_ft2].to_i(10)]
|
|
1240
|
-
legs['leg1'] = score
|
|
1241
|
-
|
|
1242
|
-
### leg2
|
|
1243
|
-
score = {}
|
|
1244
|
-
score[:ft] = [m[:leg2_ft1].to_i(10),
|
|
1245
|
-
m[:leg2_ft2].to_i(10)] if m[:leg2_ft1] && m[:leg2_ft2]
|
|
1246
|
-
score[:et] = [m[:leg2_et1].to_i(10),
|
|
1247
|
-
m[:leg2_et2].to_i(10)] if m[:leg2_et1] && m[:leg2_et2]
|
|
1248
|
-
score[:p] = [m[:leg2_p1].to_i(10),
|
|
1249
|
-
m[:leg2_p2].to_i(10)] if m[:leg2_p1] && m[:leg2_p2]
|
|
1250
|
-
legs['leg2'] = score
|
|
1251
|
-
|
|
1252
|
-
## check for (opt) aggregate - keep on "top-level"
|
|
1253
|
-
legs[:agg] = [m[:agg1].to_i(10),
|
|
1254
|
-
m[:agg2].to_i(10)] if m[:agg1] && m[:agg2]
|
|
1255
|
-
legs[:away] = true if m[:away]
|
|
1256
|
-
|
|
1257
|
-
## note - for debugging keep (pass along) "literal" score
|
|
1258
|
-
[:SCORE_LEGS, [m[:score_legs], legs]]
|
|
1259
|
-
elsif m[:score_full]
|
|
1260
|
-
score = {}
|
|
1261
|
-
score[:p] = [m[:p1].to_i(10),
|
|
1262
|
-
m[:p2].to_i(10)] if m[:p1] && m[:p2]
|
|
1263
|
-
score[:et] = [m[:et1].to_i(10),
|
|
1264
|
-
m[:et2].to_i(10)] if m[:et1] && m[:et2]
|
|
1265
|
-
score[:ft] = [m[:ft1].to_i(10),
|
|
1266
|
-
m[:ft2].to_i(10)] if m[:ft1] && m[:ft2]
|
|
1267
|
-
score[:ht] = [m[:ht1].to_i(10),
|
|
1268
|
-
m[:ht2].to_i(10)] if m[:ht1] && m[:ht2]
|
|
1269
|
-
score[:agg] = [m[:agg1].to_i(10),
|
|
1270
|
-
m[:agg2].to_i(10)] if m[:agg1] && m[:agg2]
|
|
1271
|
-
|
|
1272
|
-
if m[:away1] && m[:away2]
|
|
1273
|
-
score[:away] = [m[:away1].to_i(10),
|
|
1274
|
-
m[:away2].to_i(10)]
|
|
1275
|
-
elsif m[:away] ## fallback if no away score; check away flag
|
|
1276
|
-
score[:away] = true
|
|
1277
|
-
end
|
|
1278
|
-
|
|
1279
|
-
## add golden/silver flags
|
|
1280
|
-
score[:golden] = true if m[:aetgg] ## golden goal (gg)/sudden death (sd)
|
|
1281
|
-
score[:silver] = true if m[:aetsg] ## silver goal (sg)
|
|
1282
|
-
|
|
1283
|
-
## note - for debugging keep (pass along) "literal" score
|
|
1284
|
-
[:SCORE_FULL, [m[:score_full], score]]
|
|
1285
|
-
elsif m[:score_fuller]
|
|
1286
|
-
score = {}
|
|
1287
|
-
score[:p] = [m[:p1].to_i(10),
|
|
1288
|
-
m[:p2].to_i(10)] if m[:p1] && m[:p2]
|
|
1289
|
-
score[:et] = [m[:et1].to_i(10),
|
|
1290
|
-
m[:et2].to_i(10)] if m[:et1] && m[:et2]
|
|
1291
|
-
score[:ft] = [m[:ft1].to_i(10),
|
|
1292
|
-
m[:ft2].to_i(10)] if m[:ft1] && m[:ft2]
|
|
1293
|
-
score[:ht] = [m[:ht1].to_i(10),
|
|
1294
|
-
m[:ht2].to_i(10)] if m[:ht1] && m[:ht2]
|
|
1295
|
-
score[:agg] = [m[:agg1].to_i(10),
|
|
1296
|
-
m[:agg2].to_i(10)] if m[:agg1] && m[:agg2]
|
|
1297
|
-
if m[:away1] && m[:away2]
|
|
1298
|
-
score[:away] = [m[:away1].to_i(10),
|
|
1299
|
-
m[:away2].to_i(10)]
|
|
1300
|
-
elsif m[:away] ## fallback if no away score; check away flag
|
|
1301
|
-
score[:away] = true
|
|
1302
|
-
end
|
|
1303
|
-
|
|
1304
|
-
## add aet flag true/false
|
|
1305
|
-
# score[:aet] = true if m[:aet] || m[:aetgg] || m[:aetsg]
|
|
1306
|
-
|
|
1307
|
-
## add golden/silver flags
|
|
1308
|
-
score[:golden] = true if m[:aetgg] ## golden goal (gg)/sudden death (sd)
|
|
1309
|
-
score[:silver] = true if m[:aetsg] ## silver goal (sg)
|
|
1310
|
-
|
|
1311
|
-
## note - for debugging keep (pass along) "literal" score
|
|
1312
|
-
[:SCORE_FULLER, [m[:score_fuller], score]]
|
|
1313
|
-
elsif m[:score_fuller_more]
|
|
1314
|
-
## SCORE + SCORE_FULLER_MORE
|
|
1315
|
-
## note - after extra-time (aet) or full-time (ft)
|
|
1316
|
-
## score may be present in SCORE!!!
|
|
1317
|
-
score = {}
|
|
1318
|
-
score[:p] = [m[:p1].to_i(10),
|
|
1319
|
-
m[:p2].to_i(10)] if m[:p1] && m[:p2]
|
|
1320
|
-
score[:et] = [m[:et1].to_i(10),
|
|
1321
|
-
m[:et2].to_i(10)] if m[:et1] && m[:et2]
|
|
1322
|
-
score[:ft] = [m[:ft1].to_i(10),
|
|
1323
|
-
m[:ft2].to_i(10)] if m[:ft1] && m[:ft2]
|
|
1324
|
-
score[:ht] = [m[:ht1].to_i(10),
|
|
1325
|
-
m[:ht2].to_i(10)] if m[:ht1] && m[:ht2]
|
|
1326
|
-
score[:agg] = [m[:agg1].to_i(10),
|
|
1327
|
-
m[:agg2].to_i(10)] if m[:agg1] && m[:agg2]
|
|
1328
|
-
if m[:away1] && m[:away2]
|
|
1329
|
-
score[:away] = [m[:away1].to_i(10),
|
|
1330
|
-
m[:away2].to_i(10)]
|
|
1331
|
-
elsif m[:away] ## fallback if no away score; check away flag
|
|
1332
|
-
score[:away] = true
|
|
1333
|
-
end
|
|
1334
|
-
|
|
1335
|
-
## add flag in score for et/ft/ht
|
|
1336
|
-
score[:score] = 'et' if m[:aet] || m[:aetgg] || m[:aetsg]
|
|
1337
|
-
score[:score] = 'ft' if m[:ft]
|
|
1338
|
-
score[:score] = 'ht' if m[:ht]
|
|
1339
|
-
|
|
1340
|
-
## add golden/silver flags
|
|
1341
|
-
score[:golden] = true if m[:aetgg] ## golden goal (gg)/sudden death (sd)
|
|
1342
|
-
score[:silver] = true if m[:aetsg] ## silver goal (sg)
|
|
1343
|
-
|
|
1344
|
-
## note - for debugging keep (pass along) "literal" score
|
|
1345
|
-
[:SCORE_FULLER_MORE, [m[:score_fuller_more], score]]
|
|
1346
|
-
elsif m[:score]
|
|
1347
|
-
score = {}
|
|
1348
|
-
## note - score is "generic"
|
|
1349
|
-
## might be full-time (ft) or
|
|
1350
|
-
## after extra-time (aet) or such
|
|
1351
|
-
## or even undecided/unknown
|
|
1352
|
-
## thus, use score1/score2 and NOT ft1/ft2
|
|
1353
|
-
score[:score] = [m[:score1].to_i(10),
|
|
1354
|
-
m[:score2].to_i(10)]
|
|
1355
|
-
## note - for debugging keep (pass along) "literal" score
|
|
1356
|
-
[:SCORE, [m[:score], score]]
|
|
1357
|
-
elsif m[:score_awd] ## score awarded (awd/awd.)
|
|
1358
|
-
score = {}
|
|
1359
|
-
### note - use "generic" score for now
|
|
1360
|
-
## to match A 3-0 B [awarded] etc.
|
|
1361
|
-
score[:score] = [m[:score1].to_i(10),
|
|
1362
|
-
m[:score2].to_i(10)]
|
|
1363
|
-
## add score[:awarded] = true ???
|
|
1364
|
-
## or only use match status to avoid duplicate?
|
|
1365
|
-
[:SCORE_AWD, [m[:score_awd], score]]
|
|
1366
|
-
elsif m[:score_abd] ## score abandonded (abd/abd.)
|
|
1367
|
-
score = {}
|
|
1368
|
-
### note - use "generic" score for now
|
|
1369
|
-
score[:score] = [m[:score1].to_i(10),
|
|
1370
|
-
m[:score2].to_i(10)]
|
|
1371
|
-
## add score[:awarded] = true ???
|
|
1372
|
-
## or only use match status to avoid duplicate?
|
|
1373
|
-
[:SCORE_ABD, [m[:score_abd], score]]
|
|
1374
|
-
elsif m[:minute]
|
|
1375
|
-
minute = {}
|
|
1376
|
-
minute[:m] = m[:value].to_i(10)
|
|
1377
|
-
minute[:offset] = m[:value2].to_i(10) if m[:value2]
|
|
1378
|
-
## note - for debugging keep (pass along) "literal" minute
|
|
1379
|
-
[:MINUTE, [m[:minute], minute]]
|
|
1380
|
-
elsif m[:vs]
|
|
1381
|
-
[:VS, m[:vs]]
|
|
1382
|
-
elsif m[:sym]
|
|
1383
|
-
sym = m[:sym]
|
|
1384
|
-
## return symbols "inline" as is - why? why not?
|
|
1385
|
-
## (?<sym>[;,@|\[\]-])
|
|
1386
|
-
|
|
1387
|
-
case sym
|
|
1388
|
-
when '@' ## enter geo mode
|
|
1389
|
-
puts " ENTER GEO_RE MODE" if debug?
|
|
1390
|
-
@re = GEO_RE
|
|
1391
|
-
geo_count = 0
|
|
1392
|
-
[:'@']
|
|
1393
|
-
when ',' then [:',']
|
|
1394
|
-
when ';' then [:';']
|
|
1395
|
-
when '/' then [:'/']
|
|
1396
|
-
when '|' then [:'|']
|
|
1397
|
-
when '[' then [:'[']
|
|
1398
|
-
when ']' then [:']']
|
|
1399
|
-
when '-' then [:'-']
|
|
1400
|
-
when '(' ## enter goal scorer mode on "free-floating" open paranthesis!!!
|
|
1401
|
-
puts " ENTER GOAL_RE MODE" if debug?
|
|
1402
|
-
@re = GOAL_RE
|
|
1403
|
-
## note - eat-up ( for now; do NOT pass along as token
|
|
1404
|
-
## pass along "virtual" INLINE GOALS - why? why not?
|
|
1405
|
-
[:INLINE_GOALS, "<|INLINE_GOALS|>"]
|
|
1406
|
-
when ')' then [:')']
|
|
1407
|
-
else
|
|
1408
|
-
puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
|
|
1409
|
-
nil ## ignore others (e.g. brackets [])
|
|
1410
|
-
end
|
|
1411
|
-
elsif m[:any]
|
|
1412
|
-
## todo/check log error
|
|
1413
|
-
msg = "parse error (tokenize) - skipping any match>#{m[:any]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
|
|
1414
|
-
puts "!! WARN - #{msg}"
|
|
1415
|
-
|
|
1416
|
-
errors << msg
|
|
1417
|
-
log( "!! WARN - #{msg}" )
|
|
1418
|
-
|
|
1419
|
-
nil
|
|
1420
|
-
else
|
|
1421
|
-
## report error
|
|
1422
|
-
puts "!!! TOKENIZE ERROR - no match found"
|
|
1423
|
-
nil
|
|
1424
|
-
end
|
|
1425
|
-
end
|
|
1426
|
-
|
|
1427
|
-
|
|
1428
|
-
tokens << t if t
|
|
1429
|
-
|
|
1430
|
-
# if debug?
|
|
1431
|
-
# print ">"
|
|
1432
|
-
# print "*" * pos
|
|
1433
|
-
# puts "#{line[pos..-1]}<"
|
|
1434
|
-
# end
|
|
1435
|
-
end
|
|
1436
|
-
|
|
1437
|
-
## check if no match in end of string
|
|
1438
|
-
if offsets[1] != line.size
|
|
1439
|
-
msg = "!! WARN - parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
|
|
1440
|
-
puts msg
|
|
1441
|
-
log( msg )
|
|
1442
|
-
|
|
1443
|
-
errors << "parse error (tokenize) - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
|
|
1444
|
-
end
|
|
1445
|
-
|
|
1446
|
-
|
|
1447
|
-
# if @re == GOAL_RE ### ALWAYS switch back to top level mode
|
|
1448
|
-
# puts " LEAVE GOAL_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
|
|
1449
|
-
# @re = RE
|
|
1450
|
-
# end
|
|
1451
|
-
|
|
1452
|
-
if @re == GEO_RE ### ALWAYS switch back to top level mode
|
|
1453
|
-
puts " LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
|
|
1454
|
-
@re = RE
|
|
1455
|
-
end
|
|
1456
|
-
|
|
1457
|
-
@re = RE if @re == GROUP_DEF_RE ### ALWAYS switch back to top level mode
|
|
1458
|
-
@re = RE if @re == ROUND_DEF_RE
|
|
1459
|
-
|
|
1460
|
-
##
|
|
1461
|
-
## if in prop mode continue if last token is [,-]
|
|
1462
|
-
## otherwise change back to "standard" mode
|
|
1463
|
-
if @re == PROP_RE || @re == PROP_CARDS_RE ||
|
|
1464
|
-
@re == PROP_PENALTIES_RE ||
|
|
1465
|
-
@re == PROP_ATTENDANCE_RE || @re == PROP_REFEREE_RE
|
|
1466
|
-
if [:',', :'-', :';'].include?( tokens[-1][0] )
|
|
1467
|
-
## continue/stay in PROP_RE mode
|
|
1468
|
-
## todo/check - auto-add PROP_CONT token or such
|
|
1469
|
-
## to help parser with possible NEWLINE
|
|
1470
|
-
## conflicts - why? why not?
|
|
1471
|
-
else
|
|
1472
|
-
## switch back to top-level mode!!
|
|
1473
|
-
puts " LEAVE PROP_RE MODE, BACK TO TOP_LEVEL/RE" if debug?
|
|
1474
|
-
@re = RE
|
|
1475
|
-
## note - auto-add PROP_END (<PROP_END>)
|
|
1476
|
-
tokens << [:PROP_END, "<|PROP_END|>"]
|
|
1477
|
-
end
|
|
1478
|
-
end
|
|
1479
252
|
|
|
1480
|
-
|
|
1481
|
-
[tokens,errors]
|
|
1482
|
-
end
|
|
1483
253
|
|
|
1484
254
|
end # class Lexer
|
|
1485
255
|
end # module SportDb
|