sportdb-parser 0.7.1 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Manifest.txt +17 -4
- data/lib/sportdb/parser/lexer-on_goal.rb +172 -0
- data/lib/sportdb/parser/lexer-on_group_def.rb +31 -0
- data/lib/sportdb/parser/lexer-on_prop_lineup.rb +79 -0
- data/lib/sportdb/parser/lexer-on_prop_misc.rb +110 -0
- data/lib/sportdb/parser/lexer-on_prop_penalties.rb +40 -0
- data/lib/sportdb/parser/lexer-on_round_def.rb +37 -0
- data/lib/sportdb/parser/lexer-on_top.rb +125 -0
- data/lib/sportdb/parser/lexer-prep_doc.rb +131 -0
- data/lib/sportdb/parser/lexer-prep_line.rb +63 -0
- data/lib/sportdb/parser/lexer-tokenize.rb +449 -0
- data/lib/sportdb/parser/lexer.rb +133 -1363
- data/lib/sportdb/parser/lexer_buffer.rb +8 -37
- data/lib/sportdb/parser/lexer_token.rb +126 -0
- data/lib/sportdb/parser/parser.rb +1104 -1403
- data/lib/sportdb/parser/racc_parser.rb +36 -32
- data/lib/sportdb/parser/racc_tree.rb +65 -98
- data/lib/sportdb/parser/token-date--helpers.rb +130 -0
- data/lib/sportdb/parser/token-date--names.rb +108 -0
- data/lib/sportdb/parser/token-date.rb +20 -192
- data/lib/sportdb/parser/token-date_duration.rb +8 -27
- data/lib/sportdb/parser/token-geo.rb +16 -16
- data/lib/sportdb/parser/token-goals--helpers.rb +114 -0
- data/lib/sportdb/parser/token-goals.rb +103 -249
- data/lib/sportdb/parser/token-group.rb +8 -22
- data/lib/sportdb/parser/token-prop.rb +138 -124
- data/lib/sportdb/parser/token-prop_name.rb +48 -39
- data/lib/sportdb/parser/token-round.rb +21 -35
- data/lib/sportdb/parser/token-score--helpers.rb +189 -0
- data/lib/sportdb/parser/token-score.rb +9 -393
- data/lib/sportdb/parser/token-score_full.rb +331 -0
- data/lib/sportdb/parser/token-status.rb +44 -46
- data/lib/sportdb/parser/token-status_inline.rb +112 -0
- data/lib/sportdb/parser/token-text.rb +41 -31
- data/lib/sportdb/parser/token-time.rb +29 -26
- data/lib/sportdb/parser/token.rb +58 -159
- data/lib/sportdb/parser/version.rb +1 -1
- data/lib/sportdb/parser.rb +45 -17
- metadata +19 -6
- data/lib/sportdb/parser/blocktxt.rb +0 -99
- data/lib/sportdb/parser/lexer_tty.rb +0 -111
- data/lib/sportdb/parser/token-table.rb +0 -149
- data/lib/sportdb/parser/token_helpers.rb +0 -92
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
module SportDb
class Lexer

  ## matches a html-style comment e.g. <!-- ... -->
  ##   (may span more than one line)
  HTML_COMMENT_RE = %r{  <!--
                           .*?     ## note - use non-greedy/lazy *? match
                         -->
                      }xm          ## note - /m lets the dot (.) match newlines too


  ##
  ##  check for "literal" (multi-line) note blocks
  ##    e.g.  nb: or  note:
  ##   space required after the colon - why? why not?
  ##
  ##  note - the block must end with a blank line
  ##           or with the end-of-file/document;
  ##         a single trailing newline before end-of-file
  ##           is NOT part of the note - otherwise a (single-line) note
  ##           in the last line would swallow the final newline
  PREPROC_NOTA_BENE_RE = %r{
                             ^
                              [ ]* (?: nb | note) [ ]* : [ ]+
                               .+?    ## non-greedy
                              ## positive lookahead
                              (?=   \n[ ]*\n
                                  | \n?\z
                              )
                           }xim


  ##
  ##  note -  [] block may NOT incl. square brackets
  ##            nor a comment marker (#)
  ##  todo/check - rename to NOTE_BLOCK or TEXT_BLOCK or ???
  ##
  ##  note - no /m option required here; the pattern has no dot (.)
  ##           and the negated char class matches newlines anyway
  PREPROC_BLOCK_RE = %r{   \[
                             [^\[\]\#]*?    ## note - use non-greedy/lazy *? match
                           \]
                       }x



  ##
  ## Preprocess a complete document (text) before it gets split into lines.
  ##
  ##  steps (in order):
  ##   1) normalize unicode to composed chars (nfc)
  ##   2) "universal" newlines - windows-style cr+lf (\r\n) to lf (\n)
  ##   3) replace html comments with two spaces
  ##   4) fold multi-line note (nb:/note:) blocks into a single line
  ##   5) fold multi-line [] blocks into a single line
  ##
  ##  note - empty lines and leading spaces (indent) are kept as is
  ##
  ##  txt     - the document as a string (must be in a unicode encoding)
  ##  returns - the preprocessed document (new string)
  def _prep_doc( txt )
    ### normalize unicode (decomposed chars to composed chars)
    ##   e.g.  e (101) + combining acute accent (769)  =>  é (233)
    txt = txt.unicode_normalize(:nfc)

    ## "universal" newlines
    ##   replace all windows-style cr+lf (\r\n) to lf (\n) only
    txt = txt.gsub( "\r\n", "\n" )

    ###
    ## quick hack for now
    ##   replace html-style comments <!-- --> (incl. multi-line)
    ##     with two spaces
    ##   note - will mess-up lineno tracking for multi-line comments!!!
    ##   todo/fix - replace with space and newline(s)
    ##                to keep lineno intact - why? why not?
    ##   todo - add more "native" multi-line comment-styles
    ##            e.g. #[[ ... ]] or #<<< .. >>> or such - why? why not?
    txt = txt.gsub( HTML_COMMENT_RE ) do |m|
             _trace('preproc html comment:', m )
             '  '
          end

    ## fold multi-line note/nota bene blocks into a single line
    txt = txt.gsub( PREPROC_NOTA_BENE_RE ) do |m|
             if m.include?( "\n" )   ## check for newlines (\n) and replace
               _trace('preproc (multi-line) note/nota bene block:', m )
               m.gsub( "\n", '↵' )
             else
               m
             end
          end

    #####
    ## (another) quick hack for now
    ##   turn multi-line [] blocks into single-line blocks
    ##     by changing newline (\n) to ↵ (unicode U+21B5)
    ##
    ##  unicode options for return/arrows:
    ##   - ↵ (U+21B5): Downwards Arrow With Corner Leftwards (used here).
    ##   - ⏎ (U+23CE): Return Symbol (possible alternative).
    txt = txt.gsub( PREPROC_BLOCK_RE ) do |m|
             if m.include?( "\n" )   ## check for newlines (\n) and replace
               _trace( 'preproc (multi-line) block:', m )
               m.gsub( "\n", '↵' )
             else
               m
             end
          end

    txt
  end

end  # class Lexer
end  # module SportDb
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
module SportDb
class Lexer

  ######
  ## Auto-fix checks line-by-line.
  ##
  ##  Replaces characters that are easy to type (or paste) by accident
  ##   with the plain ascii form expected by the tokenizer
  ##   and reports every replacement via _warn:
  ##    - tab (\t)                                  =>  two spaces
  ##    - non-breaking space   (U+00A0)             =>  space
  ##    - en dash / minus sign (U+2013 / U+2212)    =>  -
  ##    - (smart) single quotes ‘ ’                 =>  '
  ##    - (smart) double quotes “ ”                 =>  "
  ##
  ##  line    - a single line (string)
  ##  returns - the auto-fixed line (new string);
  ##              a line without any of the chars is returned unchanged
  ##
  ##  note - the warn message always shows the line as it was
  ##           before the current replacement step
  ##  todo - report an error (not only a warning) - why? why not?
  def _prep_line( line )

    ## first check for tabs
    ##   for auto-fix - replace tabs with two spaces
    ##                   (as announced in the warn message)
    line = line.gsub( "\t" ) do |_|
              _warn( "auto-fix; replacing tab (\\t) with two spaces in line #{line.inspect}" )
              '  '    ## replace with two spaces
           end

    ## U+00A0 (160) -- non-breaking space (unicode)
    line = line.gsub( "\u00A0" ) do |uni|
              _warn( "auto-fix; replacing non-breaking unicode space (#{uni}/#{uni.ord}) w/ ascii space ( /#{" ".ord}) in line #{line.inspect}" )
              ' '    ## replace with space
           end

    ###
    ##  different dash candidates
    ##   – => U+2013 (8211) -- En Dash (unicode)
    ##   − => U+2212 (8722) -- Minus Sign (unicode)
    line = line.gsub( /[–−]/ ) do |uni|
              _warn( "auto-fix; replacing unicode dash (#{uni}/#{uni.ord}) w/ ascii dash (-/#{"-".ord}) in line #{line.inspect}" )
              '-'    ## replace with ascii dash (-)
           end

    #### todo - add more smart quotes to "unsmart"
    ##  smart (single) quotes
    line = line.gsub( /[‘’]/ ) do |uni|
              _warn( "auto-fix; replacing unicode (smart) quote (#{uni}/#{uni.ord}) w/ ascii quote ('/#{"'".ord}) in line #{line.inspect}" )
              "'"
           end

    ##  smart double quotes
    line = line.gsub( /[“”]/ ) do |uni|
              _warn( %Q{auto-fix; replacing unicode (smart) double quote (#{uni}/#{uni.ord}) w/ ascii double quote ("/#{'"'.ord}) in line #{line.inspect}} )
              '"'
           end

    line
  end

end  # class Lexer
end  # module SportDb
|
|
@@ -0,0 +1,449 @@
|
|
|
1
|
+
module SportDb
class Lexer

  ###
  ## Per-line context passed along to the on_round_def etc. handlers
  ##   in _tokenize_line.
  ##
  ##  use nested class for context - why? why not?
  ##  note: first arg passed in MUST be ref to lexer (instance)
  ##  note - for now only offset (in line begin/end) gets updated !!!
  class Context
    attr_writer :offset
    attr_reader :lineno

    ##  lexer  - the lexer (instance); must respond to log and _warn
    ##  line   - the line getting tokenized
    ##  lineno - the line number
    ##  errors - list (array) that collects the error messages
    def initialize( lexer,
                    line:,
                    lineno:,
                    errors: )
      @lexer  = lexer
      @line   = line
      @lineno = lineno
      @errors = errors

      ## MatchData offset e.g. [m.begin(0),m.end(0)]
      ##   or use [] aka [nil,nil] for not defined??? why? why not?
      @offset = [0,0]
    end


    ## report a match that no handler branch knows about
    ##   match - the match; must support match[:any]
    ##   mode  - name of the tokenizer mode (for the message only)
    def warn_on_else( match, mode: 'TOP' )
      if match[:any]
        _add_warn( "unexpected char >#{match[:any]}< (#{mode})" )
      else
        ## internal error - shouldn't really happen
        _add_warn( "internal error - unknown match (#{mode}): #{match.inspect}")
      end
    end


    ## note - warns get logged as errors for now too
    ##          maybe add @warns later - why? why not?
    ##
    ## note - add +1 to offset (start at one - not zero-based)
    ##          will match with (external) text editors
    def _add_warn( msg )
      msg = "parse error (tokenize) - " +
             msg +
            " in line @#{@lineno}:#{@offset[0]+1},#{@offset[1]+1} >#{@line}< "

      @errors << msg
      @lexer.log( "!! WARN - #{msg}" )

      @lexer._warn( msg )
    end

    ## todo - (disabled for now) use report/log/??_parse_error
    ##
    ##   def _add_error( msg )
    ##     msg = "parse error (tokenize) -" +
    ##            msg +
    ##           " in line #{@lineno}@#{@offset[0]},#{@offset[1]} >#{@line}< "
    ##
    ##     @errors << msg
    ##   end
  end  # class Context



  ##
  ## Tokenize a single line.
  ##
  ##  The regex/mode in use (@re) is kept between calls
  ##   (e.g. a prop line ending in a separator continues on the next line).
  ##
  ##  line    - the line (string)
  ##  lineno  - the line number (passed along to tokens and messages)
  ##  returns - a pair of [tokens, errors]
  def _tokenize_line( line, lineno )
    tokens = []
    errors = []   ## keep a list of errors - why? why not?

    pos = 0   ## note - usually same as offset[1] aka offset[end] after match
    ## track last offset (begin/end) - to report error on no match
    ##   or no match in end of string
    offset = [0,0]
    m      = nil

    ## track number of geo text seen
    ##   (use for - do NOT break on two spaces if no geo text seen yet!!)
    @geo_count = 0

    ####
    ## quick hack - keep re state/mode between tokenize calls!!!
    @re ||= RE     ## note - switch between RE & INSIDE_RE


    if @re == RE   ## top-level
      ### check for modes once (per line) here to speed-up parsing
      ###   for now goals only possible for start of line!!
      ###   fix - remove optional [] - why? why not?

      ####
      ## note - ord e.g. (45) for match number can only start a (match) line
      ##          "inline" use NOT possible
      ## note - ord (for ordinal number!!!) e.g match number (1), (42), etc.
      if (m = START_WITH_ORD.match(line))
        ## note - strip enclosing () and convert to integer
        tokens << Token.new(:ORD, m[:ord],
                              lineno: lineno, offset: m.offset(:ord),
                              value: m[:value].to_i(10) )
      elsif (m = START_WITH_YEAR.match(line))
        tokens << Token.new(:YEAR, m[:year],
                              lineno: lineno, offset: m.offset(:year),
                              value: m[:year].to_i(10) )
      elsif (m = START_WITH_GROUP_DEF_LINE_RE.match( line ))
        _trace( "ENTER GROUP_DEF_RE MODE" )
        @re = GROUP_DEF_RE

        tokens << Token.new( :GROUP_DEF, m[:group_def],
                              lineno: lineno, offset: m.offset(:group_def) )
      elsif (m = START_WITH_PROP_KEY_RE.match( line ))
        ## start with prop key (match will switch into prop mode!!!)
        ##   - fix - remove leading spaces in regex (upstream) - why? why not?
        _trace("ENTER PROP_RE MODE" )
        key        = m[:key]
        key_offset = m.offset(:key)

        ### todo/fix - add prop yellow/red cards too - why? why not?
        ## todo/fix - separate sent off and red card
        ##   sent-off - incl. red card, yellow/red card and the era before red cards!!
        case key.downcase
        when 'sent off'
          @re = PROP_CARDS_RE     ## use CARDS_RE ???
          tokens << Token.new(:PROP_SENTOFF, m[:key],
                                lineno: lineno, offset: key_offset)
        when 'red cards'
          @re = PROP_CARDS_RE     ## use CARDS_RE ???
          tokens << Token.new(:PROP_REDCARDS, m[:key],
                                lineno: lineno, offset: key_offset)
        when 'yellow cards'
          @re = PROP_CARDS_RE
          tokens << Token.new(:PROP_YELLOWCARDS, m[:key],
                                lineno: lineno, offset: key_offset)
        when 'ref',  'referee',
             'refs', 'referees'    ## note - allow/support assistant refs
          @re = PROP_REFEREE_RE
          tokens << Token.new(:PROP_REFEREE, m[:key],
                                lineno: lineno, offset: key_offset)
        when 'att', 'attn', 'attendance'
          @re = PROP_ATTENDANCE_RE
          tokens << Token.new(:PROP_ATTENDANCE, m[:key],
                                lineno: lineno, offset: key_offset)

        # when 'goals'
        #    @re = PROP_GOAL_RE
        #    tokens << [:PROP_GOALS, m[:key]]

        when 'penalties',
             'penalty shootout',
             'penalty shoot-out',
             'penalty kicks'
          @re = PROP_PENALTIES_RE
          tokens << Token.new(:PROP_PENALTIES, m[:key],
                                lineno: lineno, offset: key_offset)
        else    ## assume (team) line-up
          @re = PROP_LINEUP_RE
          ## fix-fix-fix - rename to PROP_LINEUP !!
          tokens << Token.new(:PROP, m[:key],
                                lineno: lineno, offset: key_offset)
        end
      ###
      ### todo/fix
      ###   rename to START_WITH_ROUND_DEF_OUTLINE_RE !!!!
      elsif (m = ROUND_DEF_OUTLINE_RE.match( line ))
        _trace( "ENTER ROUND_DEF_RE MODE" )
        @re = ROUND_DEF_RE

        ## note - return ROUND_DEF NOT ROUND_OUTLINE token
        ##   fix - add leading ▪ too!!
        tokens << Token.new( :ROUND_DEF, m[:round_outline],
                              lineno: lineno, offset: m.offset(:round_outline))
      elsif (m = ROUND_OUTLINE_RE.match( line ))
        _trace( "ROUND_OUTLINE" )
        ## note - derive round level from no of (leading) markers
        ##     e.g. ▪/:: is 1, ▪▪/::: is 2, ▪▪▪/:::: is 3, etc.
        ##  note - ascii-style starts with double ::, thus, autodecrement by one!
        round_level  = m[:round_marker].size
        round_level -= 1   if m[:round_marker].start_with?( '::' )

        ## note - eats-up line for now (change later to only eat-up marker e.g. »|>>)
        tokens << Token.new( :ROUND_OUTLINE, m[:round_outline],
                              lineno: lineno, offset: m.offset(:round_outline),
                              value: { outline: m[:round_outline],
                                       level:   round_level})
      elsif (m = START_GOAL_LINE_RE.match( line ))  ## line starting with ( - assume
        ##  switch context to GOAL_RE (goalline(s))
        ####
        ##  note - check for alternate goal line styles / formats
        if START_GOAL_LINE_COMPAT_RE.match(line )
          ## "legacy" style starting with minute e.g.
          ##   (6 Puskás 0-1, 9 Czibor 0-2, 11 Morlock 1-2, 18 Rahn 2-2,
          ##      84 Rahn 3-2)
          @re = GOAL_COMPAT_RE
          _trace( "ENTER GOAL_COMPAT_RE MODE" )

          tokens << Token.virtual( :GOALS_COMPAT, lineno: lineno )
        elsif START_GOAL_LINE_ALT_RE.match( line )
          ##  goals with scores e.g.
          ##   (1-0 Franck Ribéry, 2-0 Ivica Olić, 2-1 Wayne Rooney)
          ##   -or-
          ##   (Dion Beljo 1-0
          ##      1-1 Andreas Gruber
          ##    Matthias Seidl 2-1)
          @re = GOAL_ALT_RE
          _trace( "ENTER GOAL_ALT_RE MODE" )

          tokens << Token.virtual( :GOALS_ALT, lineno: lineno )
        else
          ## "standard" / default style
          @re = GOAL_RE
          _trace( "ENTER GOAL_RE MODE" )

          tokens << Token.virtual( :GOALS, lineno: lineno )
        end

        ## note - eat-up ( for now
        ##    pass along "virtual" GOALS or GOALS_ALT token
        ##      (see INLINE_GOALS for the starting goal line inline)
        ##
        ##  fix-fix-fix
        ##   keep offset at [0,0] - why? why not?
        ##     do NOT eat-up
        ##   or better
        ##     add tokens << Token.literal( '(', lineno: lineno, offset: ...) !!!
      end

      ## note - every start-of-line match gets eaten-up (in full)
      ##          m is nil if no start-of-line pattern matched
      if m
        offset = m.offset(0)
        pos    = offset[1]    ## update pos
      end
    end


    old_pos = -1   ## allows to backtrack to old pos (used in geo)

    ctx = Context.new( self,
                       line:   line,
                       lineno: lineno,
                       errors: errors )


    while (m = @re.match( line, pos ))
      offset     = m.offset(0)
      ctx.offset = offset

      if offset[0] != pos
        ## match NOT starting at start/begin position!!!
        ##  report parse error!!!
        msg =  "parse error (tokenize) - skipping >#{line[pos..(offset[0]-1)]}< in line #{lineno}@#{offset[0]},#{offset[1]} >#{line}<"
        errors << msg

        log( msg )
        puts "!! WARN - #{msg}"
      end

      ##
      ## todo/fix - also check if possible
      ##   if no match but not yet end off string!!!!
      ##    report skipped text run too!!!

      old_pos = pos
      pos     = offset[1]

      ##
      ## note: racc requires pairs e.g. [:TOKEN, VAL]
      ##         for VAL use "text" or ["text", { opts }]  array

      t = if    @re == ROUND_DEF_RE  then  _on_round_def( m, ctx: ctx )
          elsif @re == GROUP_DEF_RE  then  _on_group_def( m, ctx: ctx )
          elsif @re == GEO_RE
            ### note - possibly end inline geo on [ (and others?? in the future
            ## note: break on double spaces e.g.
            ##   e.g. Jul/16 @ Arena Auf Schalke, Gelsenkirchen  Serbia 0-1 England
            if m[:spaces]
              ### note - do NOT break out
              ##            if not text seen yet!!!
              if @geo_count > 0
                ## get out-off geo mode and backtrack (w/ next)
                ##
                ## todo/fix
                ##   add virtual geo_end token!!!
                _trace( "LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" )
                @re = RE
                pos = old_pos
                next   ## backtrack (resume new loop step)
              else
                nil    ## skip spaces
              end
            elsif m[:space]
              nil   ## skip (single) space
            elsif m[:text]
              @geo_count += 1
              ## keep pos - why? why not?
              Token.new(:GEO, m[:text],
                          lineno: lineno, offset: m.offset(:text))
            elsif m[:geo_end]   ## "hacky" special comma; always ends geo mode!!!
              ## get out-off geo mode and backtrack (w/ next)
              ## todo/fix
              ##   add (semi-) virtual geo_end token!!!
              _trace( "LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" )
              @re = RE
              pos = old_pos
              next   ## backtrack (resume new loop step)
            elsif m[:sym]
              case m[:sym]
              ## note - reset geo_count to 0 (avoids break on two spaces)
              ##          if separator seen!!
              when ','
                @geo_count = 0
                Token.literal( m[:sym], lineno: lineno, offset: m.offset(:sym))
              when '›', '>'
                ## note - treat geo sep › (unicode) and > (ascii)
                ##          like comma for now!!!
                @geo_count = 0
                Token.literal( ',', lineno: lineno, offset: m.offset(:sym))
              when '['
                ##
                ## todo/fix
                ##   add virtual geo_end token!!!
                ## get out-off geo mode and backtrack (w/ next)
                _trace( "LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" )
                @re = RE
                pos = old_pos
                next   ## backtrack (resume new loop step)
              else
                Token.literal( m[:sym], lineno: lineno, offset: m.offset(:sym))
              end
            else
              ctx.warn_on_else( m, mode: 'GEO' )
              nil
            end
          elsif @re == PROP_CARDS_RE      then  _on_prop_cards( m, ctx: ctx )
          elsif @re == PROP_LINEUP_RE     then  _on_prop_lineup( m, ctx: ctx )
          elsif @re == PROP_ATTENDANCE_RE then  _on_prop_attendance( m, ctx: ctx )
          elsif @re == PROP_REFEREE_RE    then  _on_prop_referee( m, ctx: ctx )
          elsif @re == PROP_PENALTIES_RE  then  _on_prop_penalties( m, ctx: ctx )
          elsif @re == GOAL_COMPAT_RE     then  _on_goal_compat( m, ctx: ctx )
          elsif @re == GOAL_ALT_RE        then  _on_goal_alt( m, ctx: ctx )
          elsif @re == GOAL_RE            then  _on_goal( m, ctx: ctx )
          ###################################################
          ## assume TOP_LEVEL (a.k.a. RE) machinery
          else
            _on_top( m, ctx: ctx )
          end

      tokens << t    if t
    end

    ## check if no match in end of string
    if offset[1] != line.size
      msg =  "parse error (tokenize) - skipping >#{line[offset[1]..-1]}< in line #{lineno}@#{offset[1]},#{line.size} >#{line}<"
      errors << msg

      log( msg )
      puts "!! WARN - #{msg}"
    end


    # if @re == GOAL_RE   ### ALWAYS switch back to top level mode
    #  puts "  LEAVE GOAL_RE MODE, BACK TO TOP_LEVEL/RE"  if debug?
    #  @re = RE
    # end

    if @re == GEO_RE   ### ALWAYS switch back to top level mode
      _trace( "LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" )
      @re = RE
    end

    ### ALWAYS switch back to top level mode
    @re = RE   if @re == GROUP_DEF_RE ||
                  @re == ROUND_DEF_RE

    ##
    ## if in prop mode continue if last token is [,-;]
    ##   otherwise change back to "standard" mode
    if @re == PROP_LINEUP_RE     ||
       @re == PROP_CARDS_RE      ||
       @re == PROP_PENALTIES_RE  ||
       @re == PROP_ATTENDANCE_RE ||
       @re == PROP_REFEREE_RE
      ## note - the line may yield no tokens at all (e.g. spaces only);
      ##          guard against nil and treat like "no separator seen"
      last_token = tokens[-1]
      if last_token && [',', '-', ';'].include?( last_token.type )
        ## continue/stay in PROP_RE mode
        ##  todo/check - auto-add PROP_CONT token or such
        ##                 to help parser with possible NEWLINE
        ##                   conflicts  - why? why not?
      else
        ## switch back to top-level mode!!
        _trace( "LEAVE PROP_RE MODE, BACK TO TOP_LEVEL/RE" )
        @re = RE
        ## note - auto-add PROP_END (<PROP_END>)
        tokens << Token.virtual(:PROP_END, lineno: lineno)
      end
    end


    [tokens,errors]
  end

end  ## class Lexer
end  ## module SportDb
|