sportdb-parser 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/README.md +0 -5
- data/Rakefile +1 -0
- data/lib/sportdb/parser/parser.rb +631 -212
- data/lib/sportdb/parser/token-text.rb +1 -1
- data/lib/sportdb/parser/token.rb +58 -56
- data/lib/sportdb/parser/version.rb +1 -1
- data/lib/sportdb/parser.rb +238 -0
- metadata +16 -2
data/lib/sportdb/parser/token.rb
CHANGED
@@ -68,24 +68,41 @@ BASICS_RE = %r{
|
|
68
68
|
(?<vs>
|
69
69
|
(?<=[ ]) # Positive lookbehind for space
|
70
70
|
(?:
|
71
|
-
vs
|
72
|
-
|
73
|
-
|
74
|
-
|
71
|
+
vs|v
|
72
|
+
)
|
73
|
+
# not bigger match first e.g. vs than v etc.
|
74
|
+
# todo/fix - make vs|v case sensitive!!! only match v/vs - why? why not?
|
75
75
|
(?=[ ]) # positive lookahead for space
|
76
76
|
)
|
77
77
|
|
|
78
|
+
(?<spaces> [ ]{2,}) |
|
79
|
+
(?<space> [ ])
|
80
|
+
|
|
81
|
+
(?<sym>[;,@|\[\]-])
|
82
|
+
}ix
|
83
|
+
|
84
|
+
|
85
|
+
## removed from basics
|
86
|
+
=begin
|
78
87
|
(?<none>
|
79
88
|
(?<=[ \[]|^) # Positive lookbehind for space or [
|
80
89
|
-
|
81
90
|
(?=[ ]*;) # positive lookahead for space
|
82
91
|
)
|
83
92
|
|
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
93
|
+
(?<vs>
|
94
|
+
(?<=[ ]) # Positive lookbehind for space
|
95
|
+
(?:
|
96
|
+
vs\.?| ## allow optional dot (eg. vs. v.)
|
97
|
+
v\.?|
|
98
|
+
-
|
99
|
+
) # not bigger match first e.g. vs than v etc.
|
100
|
+
(?=[ ]) # positive lookahead for space
|
101
|
+
)
|
102
|
+
|
|
103
|
+
|
104
|
+
make - into a simple symbol !!!
|
105
|
+
=end
|
89
106
|
|
90
107
|
|
91
108
|
MINUTE_RE = %r{
|
@@ -141,8 +158,7 @@ end
|
|
141
158
|
|
142
159
|
|
143
160
|
|
144
|
-
def tokenize_with_errors( line,
|
145
|
-
debug: false )
|
161
|
+
def tokenize_with_errors( line, debug: false )
|
146
162
|
tokens = []
|
147
163
|
errors = [] ## keep a list of errors - why? why not?
|
148
164
|
|
@@ -180,6 +196,10 @@ def tokenize_with_errors( line, typed: false,
|
|
180
196
|
|
181
197
|
pp offsets if debug
|
182
198
|
|
199
|
+
##
|
200
|
+
## note: racc requires pairs e.g. [:TOKEN, VAL]
|
201
|
+
## for VAL use "text" or ["text", { opts }] array
|
202
|
+
|
183
203
|
t = if m[:space]
|
184
204
|
## skip space
|
185
205
|
nil
|
@@ -187,15 +207,17 @@ def tokenize_with_errors( line, typed: false,
|
|
187
207
|
## skip spaces
|
188
208
|
nil
|
189
209
|
elsif m[:text]
|
190
|
-
[:
|
210
|
+
[:TEXT, m[:text]] ## keep pos - why? why not?
|
191
211
|
elsif m[:status] ## (match) status e.g. cancelled, awarded, etc.
|
212
|
+
## todo/check - add text (or status)
|
213
|
+
# to opts hash {} by default (for value)
|
192
214
|
if m[:status_note] ## includes note? e.g. awarded; originally 2-0
|
193
|
-
[:
|
215
|
+
[:STATUS, [m[:status], {status: m[:status],
|
216
|
+
note: m[:status_note]} ]]
|
194
217
|
else
|
195
|
-
[:status, m[:status]]
|
218
|
+
[:STATUS, [m[:status], {status: m[:status] } ]]
|
196
219
|
end
|
197
220
|
elsif m[:time]
|
198
|
-
if typed
|
199
221
|
## unify to iso-format
|
200
222
|
### 12.40 => 12:40
|
201
223
|
## 12h40 => 12:40 etc.
|
@@ -208,15 +230,11 @@ def tokenize_with_errors( line, typed: false,
|
|
208
230
|
(minute >=0 && minute <= 59)
|
209
231
|
## note - for debugging keep (pass along) "literal" time
|
210
232
|
## might use/add support for am/pm later
|
211
|
-
[:
|
233
|
+
[:TIME, [m[:time], {h:hour,m:minute}]]
|
212
234
|
else
|
213
235
|
raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
|
214
236
|
end
|
215
|
-
else
|
216
|
-
[:time, m[:time]]
|
217
|
-
end
|
218
237
|
elsif m[:date]
|
219
|
-
if typed
|
220
238
|
date = {}
|
221
239
|
=begin
|
222
240
|
((?<day_name>#{DAY_NAMES})
|
@@ -237,14 +255,11 @@ def tokenize_with_errors( line, typed: false,
|
|
237
255
|
date[:d] = m[:day].to_i(10) if m[:day]
|
238
256
|
date[:wday] = DAY_MAP[ m[:day_name].downcase ] if m[:day_name]
|
239
257
|
## note - for debugging keep (pass along) "literal" date
|
240
|
-
[:
|
241
|
-
else
|
242
|
-
[:date, m[:date]]
|
243
|
-
end
|
258
|
+
[:DATE, [m[:date], date]]
|
244
259
|
elsif m[:timezone]
|
245
|
-
[:
|
260
|
+
[:TIMEZONE, m[:timezone]]
|
246
261
|
elsif m[:duration]
|
247
|
-
|
262
|
+
## todo/check/fix - if end: works for kwargs!!!!!
|
248
263
|
duration = { start: {}, end: {}}
|
249
264
|
duration[:start][:y] = m[:year1].to_i(10) if m[:year1]
|
250
265
|
duration[:start][:m] = MONTH_MAP[ m[:month_name1].downcase ] if m[:month_name1]
|
@@ -255,19 +270,11 @@ def tokenize_with_errors( line, typed: false,
|
|
255
270
|
duration[:end][:d] = m[:day2].to_i(10) if m[:day2]
|
256
271
|
duration[:end][:wday] = DAY_MAP[ m[:day_name2].downcase ] if m[:day_name2]
|
257
272
|
## note - for debugging keep (pass along) "literal" duration
|
258
|
-
[:
|
259
|
-
|
260
|
-
[:duration, m[:duration]]
|
261
|
-
end
|
262
|
-
elsif m[:num]
|
263
|
-
if typed
|
273
|
+
[:DURATION, [m[:duration], duration]]
|
274
|
+
elsif m[:num] ## fix - change to ord (for ordinal number!!!)
|
264
275
|
## note - strip enclosing () and convert to integer
|
265
|
-
[:num, m[:value].to_i(10)]
|
266
|
-
else
|
267
|
-
[:num, m[:num]]
|
268
|
-
end
|
276
|
+
[:ORD, [m[:num], { value: m[:value].to_i(10) } ]]
|
269
277
|
elsif m[:score]
|
270
|
-
if typed
|
271
278
|
score = {}
|
272
279
|
## check for pen
|
273
280
|
score[:p] = [m[:p1].to_i(10),
|
@@ -280,42 +287,39 @@ def tokenize_with_errors( line, typed: false,
|
|
280
287
|
m[:ht2].to_i(10)] if m[:ht1] && m[:ht2]
|
281
288
|
|
282
289
|
## note - for debugging keep (pass along) "literal" score
|
283
|
-
[:
|
284
|
-
else
|
285
|
-
[:score, m[:score]]
|
286
|
-
end
|
290
|
+
[:SCORE, [m[:score], score]]
|
287
291
|
elsif m[:minute]
|
288
|
-
if typed
|
289
292
|
minute = {}
|
290
293
|
minute[:m] = m[:value].to_i(10)
|
291
294
|
minute[:offset] = m[:value2].to_i(10) if m[:value2]
|
292
295
|
## note - for debugging keep (pass along) "literal" minute
|
293
|
-
[:
|
294
|
-
else
|
295
|
-
[:minute, m[:minute]]
|
296
|
-
end
|
296
|
+
[:MINUTE, [m[:minute], minute]]
|
297
297
|
elsif m[:og]
|
298
|
-
|
298
|
+
[:OG, m[:og]] ## for typed drop - string version/variants ?? why? why not?
|
299
299
|
elsif m[:pen]
|
300
|
-
|
300
|
+
[:PEN, m[:pen]]
|
301
301
|
elsif m[:vs]
|
302
|
-
|
303
|
-
elsif m[:none]
|
304
|
-
typed ? [:none] : [:none, m[:none]]
|
302
|
+
[:VS, m[:vs]]
|
305
303
|
elsif m[:sym]
|
306
304
|
sym = m[:sym]
|
307
305
|
## return symbols "inline" as is - why? why not?
|
306
|
+
## (?<sym>[;,@|\[\]-])
|
307
|
+
|
308
308
|
case sym
|
309
309
|
when ',' then [:',']
|
310
310
|
when ';' then [:';']
|
311
311
|
when '@' then [:'@']
|
312
312
|
when '|' then [:'|']
|
313
|
+
when '[' then [:'[']
|
314
|
+
when ']' then [:']']
|
315
|
+
when '-' then [:'-']
|
313
316
|
else
|
314
317
|
nil ## ignore others (e.g. brackets [])
|
315
318
|
end
|
316
319
|
else
|
317
320
|
## report error
|
318
|
-
|
321
|
+
puts "!!! TOKENIZE ERROR - no match found"
|
322
|
+
nil
|
319
323
|
end
|
320
324
|
|
321
325
|
tokens << t if t
|
@@ -342,10 +346,8 @@ end
|
|
342
346
|
|
343
347
|
|
344
348
|
### convenience helper - ignore errors by default
|
345
|
-
def tokenize( line,
|
346
|
-
|
347
|
-
tokens, _ = tokenize_with_errors( line, typed: typed,
|
348
|
-
debug: debug )
|
349
|
+
## Tokenize a single line and hand back only the tokens;
##   the error list produced by tokenize_with_errors is dropped.
def tokenize( line, debug: false )
  tokens, _errors = tokenize_with_errors( line, debug: debug )
  tokens
end
|
351
353
|
|
data/lib/sportdb/parser.rb
CHANGED
@@ -47,5 +47,243 @@ end # module SportDb
|
|
47
47
|
=end
|
48
48
|
|
49
49
|
|
50
|
+
|
51
|
+
module SportDb
  ##
  # Turns a multi-line text into one flat token stream in the shape
  # racc expects ([TOKEN, VALUE] pairs) and hands the tokens out
  # one-by-one via next_token.
  #
  # The line-level work is delegated to Parser (resolved by constant
  # lookup at runtime - presumably SportDb::Parser; confirm). The parser
  # object must respond to tokenize, is_group?, is_round? and is_leg?.
  class Tokenizer

    ## the remaining (not yet consumed) tokens
    attr_reader :tokens

    ##
    # txt - the complete text; split on "\n" and tokenized line-by-line.
    #
    # Blank lines and lines starting with '#' (after stripping) are skipped.
    # Every kept line is followed by an auto-added [:NEWLINE, "\n"] token.
    #
    # note: prints every kept line and its tokens to stdout (debug output).
    #
    # Raises ArgumentError if a token is not of size 1 or 2.
    def initialize( txt )
      parser = Parser.new

      tree = []
      txt.split( "\n" ).each do |line|
        stripped = line.strip    ## strip once; reuse for both checks
        next if stripped.empty? || stripped.start_with?( '#' )

        puts "line >#{line}<"
        line_tokens = parser.tokenize( line )
        pp line_tokens

        tree << line_tokens
      end

      ## disabled (kept for reference) - quick hack
      ##   turn all text tokens followed by minute token
      ##   into player tokens!!!
      ##
      ##   also auto-convert text tokens into team tokens - why? why not?
      ##
      ##   tree.each do |tokens|
      ##     tokens.each_with_index do |t0,idx|
      ##       t1 = tokens[idx+1]
      ##       if t1 && t1[0] == :minute && t0[0] == :text
      ##         t0[0] = :player
      ##       end
      ##     end
      ##   end

      ## disabled (kept for reference)
      ##   auto-add/insert start tokens for known line patterns
      ##   START_GOALS for goals_line
      ##   why? why not?

      ## flatten, auto-add newlines and convert to racc format
      ##   note: flat_map builds the result once
      ##         (no array re-allocation per line)
      @tokens = tree.flat_map do |line_tokens|
        ( line_tokens + [[:NEWLINE, "\n"]] ).map { |tok| racc_token( parser, tok ) }
      end
    end

    ## Remove and return the next token; returns nil once all are consumed.
    def next_token
      @tokens.shift
    end

    private

    ## Convert a single tokenizer token into a racc [TOKEN, VALUE] pair.
    ##   size 1 e.g. [:','] => [",", ","]   (symbol repeated as string)
    ##   size 2             => passed through, except :TEXT tokens whose
    ##                         text matches a keyword (group, round, leg)
    def racc_token( parser, tok )
      case tok.size
      when 1
        [tok[0].to_s, tok[0].to_s]
      when 2
        return tok unless tok[0] == :TEXT

        text = tok[1]
        if parser.is_group?( text )
          [:GROUP, text]
        elsif parser.is_round?( text ) || parser.is_leg?( text )
          [:ROUND, text]
        else
          tok  ## pass through as-is (1:1)
        end
      else
        raise ArgumentError, "tokens of size 1|2 expected; got #{tok.pretty_inspect}"
      end
    end
  end  # class Tokenizer
end # module SportDb
|
136
|
+
|
137
|
+
|
138
|
+
|
139
|
+
####
|
140
|
+
# RaccMatchParser support machinery (incl. node classes/abstract syntax tree)
|
141
|
+
|
142
|
+
##
# Hand-written part of RaccMatchParser: the node structs that make up the
# parse tree (each with a compact pretty_print for debugging) plus the
# glue methods (next_token, parse, on_error).
#
# note: parse calls do_parse, which is not defined here - presumably
#       supplied by the racc-generated part of this class (confirm).
class RaccMatchParser

  ## group definition - name plus teams
  GroupDef = Struct.new( :name, :teams ) do
    def pretty_print( printer )
      printer.text( "<GroupDef " )
      printer.text( self.name )
      printer.text( " teams=" + self.teams.pretty_inspect )
      printer.text( ">" )
    end
  end

  ## round definition - name plus optional date or duration
  RoundDef = Struct.new( :name, :date, :duration ) do
    def pretty_print( printer )
      printer.text( "<RoundDef " )
      printer.text( self.name )
      printer.text( " date=" + self.date.pretty_inspect ) if date
      ## fix: label was misspelled ("durattion=")
      printer.text( " duration=" + self.duration.pretty_inspect ) if duration
      printer.text( ">" )
    end
  end

  DateHeader = Struct.new( :date ) do
    def pretty_print( printer )
      printer.text( "<DateHeader " )
      printer.text( "#{self.date.pretty_inspect}>" )
    end
  end

  GroupHeader = Struct.new( :name ) do
    def pretty_print( printer )
      printer.text( "<GroupHeader " )
      printer.text( "#{self.name}>" )
    end
  end

  ## note: names is joined with ', ' for printing
  RoundHeader = Struct.new( :names ) do
    def pretty_print( printer )
      printer.text( "<RoundHeader " )
      printer.text( "#{self.names.join(', ')}>" )
    end
  end

  MatchLine = Struct.new( :ord, :date, :time,
                          :team1, :team2, :score,
                          :geo ) do   ## change to geos - why? why not?

    ## prints "team1 v team2" first, then all other members that are set
    def pretty_print( printer )
      printer.text( "<MatchLine " )
      printer.text( "#{self.team1} v #{self.team2}" )
      printer.breakable

      each_pair do |name, value|
        next if [:team1, :team2].include?( name )   ## already printed above
        next if value.nil?

        printer.text( "#{name}=#{value.pretty_inspect}" )
      end

      printer.text( ">" )
    end
  end

  GoalLine = Struct.new( :goals1, :goals2 ) do
    def pretty_print( printer )
      printer.text( "<GoalLine " )
      printer.text( "goals1=" + self.goals1.pretty_inspect + "," )
      printer.breakable
      printer.text( "goals2=" + self.goals2.pretty_inspect + ">" )
    end
  end

  ## goal - player followed by the minutes, separated by spaces
  Goal = Struct.new( :player, :minutes ) do
    def to_s
      buf = String.new
      buf << "#{self.player}"
      buf << " "
      buf << minutes.map { |min| min.to_s }.join(' ')
      buf
    end

    def pretty_print( printer )
      printer.text( to_s )
    end
  end

  ## minute e.g. 45+2'(pen) - offset, og and pen get printed only if set
  Minute = Struct.new( :m, :offset, :og, :pen ) do
    def to_s
      buf = String.new
      buf << "#{self.m}"
      buf << "+#{self.offset}" if self.offset
      buf << "'"
      buf << "(og)" if self.og
      buf << "(pen)" if self.pen
      buf
    end

    def pretty_print( printer )
      printer.text( to_s )
    end
  end


  ## input - the text to parse; echoed to stdout (debug output)
  ##           and passed on to SportDb::Tokenizer
  def initialize(input)
    puts "==> input:"
    puts input
    @tokenizer = SportDb::Tokenizer.new(input)
  end

  ## Fetch the next token from the tokenizer (and log it to stdout).
  def next_token
    tok = @tokenizer.next_token
    puts "next_token => #{tok.pretty_inspect}"
    tok
  end

  ## Reset the tree, run do_parse and return the tree.
  def parse
    puts "parse:"
    @tree = []
    do_parse
    @tree
  end

  ## Error hook - prints the arguments only (does not raise).
  ##
  ## alternative (block-style) version kept for reference:
  ##   on_error do |error_token_id, error_value, value_stack|
  ##     puts "Parse error on token: #{error_token_id}, value: #{error_value}"
  ##   end
  def on_error(*args)
    puts "!! on error:"
    puts "args=#{args.pretty_inspect}"
  end

end
|
286
|
+
|
287
|
+
|
50
288
|
puts SportDb::Module::Parser.banner # say hello
|
51
289
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sportdb-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-01-
|
11
|
+
date: 2025-01-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: cocos
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: racc
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: rdoc
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|