sportdb-parser 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Manifest.txt +14 -0
- data/README.md +8 -0
- data/Rakefile +27 -0
- data/bin/fbt +144 -0
- data/lib/sportdb/parser/lang.rb +111 -0
- data/lib/sportdb/parser/linter.rb +153 -0
- data/lib/sportdb/parser/outline_reader.rb +101 -0
- data/lib/sportdb/parser/parser.rb +196 -0
- data/lib/sportdb/parser/token-date.rb +193 -0
- data/lib/sportdb/parser/token-score.rb +121 -0
- data/lib/sportdb/parser/token-text.rb +114 -0
- data/lib/sportdb/parser/token.rb +364 -0
- data/lib/sportdb/parser.rb +44 -0
- metadata +96 -0
@@ -0,0 +1,364 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
module SportDb
|
4
|
+
class Parser
|
5
|
+
|
6
|
+
|
7
|
+
##
#  TIME_RE - clock time written as hour, one separator and minute
#     e.g. 18.30 or 18:30 or 18h30
#
#  named captures:
#    time   - the complete literal (e.g. "18h30")
#    hour   - one or two digits
#    minute - exactly two digits
#
#  note - the pattern checks the shape only; it does NOT check
#           that hour / minute are in range (e.g. 99:99 matches)
#
#  open questions:
#    - keep 18h30 - why? why not?
#    - add support for 6:30pm 8:20am etc. - why? why not?

TIME_RE = %r{
    ## e.g. 18.30 (or 18:30 or 18h30)
    (?<time>  \b
        (?<hour>\d{1,2})
            (?: :|\.|h )
        (?<minute>\d{2})
            \b
    )
}ix
|
20
|
+
|
21
|
+
|
22
|
+
|
23
|
+
##
#  TIMEZONE_RE - timezone in parentheses with a utc offset
#
#  for timezone format use for now:
#     (BRT/UTC-3)      (e.g. brazil time)
#
#     (CET/UTC+1)   - central european time
#     (CEST/UTC+2)  - central european summer time  - daylight saving time (DST).
#     (EET/UTC+2)   - eastern european time
#     (EEST/UTC+3)  - eastern european summer time  - daylight saving time (DST).
#
#     UTC+3
#     UTC+4
#     UTC+0
#     UTC+00
#     UTC+0000
#
#   - allow +01 or +0100  - why? why not
#   -       +0130 (01:30)
#
#  see
#    https://en.wikipedia.org/wiki/Time_zone
#    https://en.wikipedia.org/wiki/List_of_UTC_offsets
#    https://en.wikipedia.org/wiki/UTC−04:00  etc.
#
#  named captures:
#    timezone - the complete literal incl. parentheses
#
#  note - the pattern accepts any run of letters before the sign
#           (not only the literal UTC) and only the ascii signs + and -

TIMEZONE_RE = %r{
   ## e.g. (UTC-2) or (CEST/UTC-2) etc.
   (?<timezone>
      \(
           ## optional "local" timezone name eg. BRT or CEST etc.
           (?:  [a-z]+
                 /
           )?
             [a-z]+
             [+-]
             \d{1,4}   ## e.g. 0 or 00 or 0000
      \)
   )
}ix
|
60
|
+
|
61
|
+
|
62
|
+
|
63
|
+
|
64
|
+
##
#  BASICS_RE - the "small" tokens
#
#  named captures (one alternative each, tried in this order):
#    num    - number in parentheses e.g. (51) or (1); value holds the digits
#    vs     - team separator - vs, vs., v, v. or -
#               (requires a space before AND after)
#    none   - a single - followed by (optional spaces and) a semicolon;
#               must follow a space, a [ or the start of the line
#    spaces - two or more spaces
#    space  - one space
#    sym    - one of  ; , @ | [ ]
#
#  NOTE(review): vs is listed before none - a dash with a space on
#     both sides (e.g. "[ - ; ...") is matched as vs even if a
#     semicolon follows; confirm that this is intended

BASICS_RE = %r{
    ## e.g. (51) or (1) etc.  - limit digits of number???
    (?<num> \(  (?<value>\d+) \) )
       |
    (?<vs>
       (?<=[ ])    # positive lookbehind for space
       (?:
          vs\.?|   ## allow optional dot (eg. vs. v.)
          v\.?|
          -
       )           # note - longer alternative first e.g. vs before v
       (?=[ ])     # positive lookahead for space
    )
       |
    (?<none>
       (?<=[ \[]|^)    # positive lookbehind for space or [ or start of line
           -
        (?=[ ]*;)      # positive lookahead for optional spaces and semicolon
    )
       |
    (?<spaces> [ ]{2,}) |
    (?<space>  [ ])
       |
    (?<sym>[;,@|\[\]])
}ix
|
89
|
+
|
90
|
+
|
91
|
+
##
#  MINUTE_RE - (goal) minute e.g. 45' or 90+3'
#
#  named captures:
#    minute - the complete literal incl. the minute marker (')
#    value  - one to three digits
#    value2 - optional one to three digits after the plus sign (e.g. 3 in 90+3')
#
#  note - a space before the number is required (lookbehind);
#           the space itself is not part of the match

MINUTE_RE = %r{
     (?<minute>
       (?<=[ ])   # positive lookbehind for space required
            (?<value>\d{1,3})      ## constrain numbers to 0 to 999!!!
               (?: \+
                 (?<value2>\d{1,3})
               )?
            '     ## must have minute marker!!!!
     )
}ix
|
101
|
+
|
102
|
+
|
103
|
+
##  (match) status
##    note: english usage - cancelled (in UK), canceled (in US)
##
##  add more variants - why? why not?
##
##  named captures:
##    status - the status word or abbreviation (with dot) as written
##
##  todo/check: remove lookahead assertion here - why require space?
##  note: \b works only after non-alphanum
##          to make it work with awd. (dot) "custom" lookahead needed
##          - the match must be followed by a space, a ] or the end of the line

STATUS_RE = %r{
     (?<status>
         \b
         (?:
            cancelled|canceled|can\.
               |
            abandoned|abd\.
               |
            postponed
               |
            awarded|awd\.
               |
            replay
         )
     (?=[ \]]|$)
     )}ix
|
128
|
+
|
129
|
+
|
130
|
+
## goal types
#    (pen.) or (pen) or (p.) or (p)
##   (o.g.) or (og)    - see GOAL_OG_RE

##
#  GOAL_PEN_RE - penalty marker in parentheses
#
#  named captures:
#    pen - the complete literal incl. parentheses e.g. (pen.)
GOAL_PEN_RE = %r{
   (?<pen> \(
           (?:pen|p)\.?
          \)
   )
}ix
|
139
|
+
##
#  GOAL_OG_RE - own goal marker in parentheses
#     e.g. (og) or (o.g.)
#
#  named captures:
#    og - the complete literal incl. parentheses
#
#  note - the dotted form requires both dots; (o.g) and (og.) do NOT match
GOAL_OG_RE = %r{
   (?<og> \(
          (?:og|o\.g\.)
          \)
   )
}ix
|
145
|
+
|
146
|
+
|
147
|
+
|
148
|
+
|
149
|
+
##
#  RE - the "master" pattern used by the tokenizer;
#         the union of all token patterns
#
#  note - the order of the arguments is the order in which the
#           alternatives are tried at a position (first match wins)
#  note - DURATION_RE, DATE_RE, SCORE_RE and TEXT_RE are not defined
#           in this file
RE = Regexp.union(  STATUS_RE,
                    TIMEZONE_RE,
                    TIME_RE,
                    DURATION_RE,  # note - duration MUST match before date
                    DATE_RE,
                    SCORE_RE,
                    BASICS_RE, MINUTE_RE,
                    GOAL_OG_RE, GOAL_PEN_RE,
                    TEXT_RE )
|
158
|
+
|
159
|
+
|
160
|
+
##
#  append msg (plus a newline) to ./logs.txt
#    in the current working directory; the file is opened in
#    append mode with utf-8 encoding and closed again on every call
#
#   use ./errors.txt - why? why not?
def log( msg )
  File.open( './logs.txt', 'a:utf-8' ) do |f|
    f.write( msg )
    f.write( "\n" )
  end
end
|
168
|
+
|
169
|
+
|
170
|
+
|
171
|
+
##
#  tokenize a single line and collect parse errors (instead of stopping)
#
#  scans line from left to right with RE; text that has to be
#  skipped to reach the next match (or that is left over at the end
#  of the line) is reported three ways - printed with puts,
#  appended to the log with log() and added to the returned errors
#
#  params:
#    line  - string to tokenize
#    typed - if true, tokens get converted values (integers / hashes)
#              added to (or in place of) the literal text
#    debug - if true, print match data and scan positions
#
#  returns [tokens, errors]
#    tokens - array of arrays; the first item is the token type (symbol)
#    errors - array of strings
#
#  raises ArgumentError if typed and a time is out-of-range
#           (hour 0 to 24, minute 0 to 59)
#
#  notes:
#    - space / spaces and the bracket symbols [ ] add no token
#    - a match with none of the known captures is dropped silently
#    - requires RE, MONTH_MAP, DAY_MAP and log defined elsewhere
def tokenize_with_errors( line, typed: false,
                                debug: false )
  tokens = []
  errors = []   ## keep a list of errors - why? why not?

  puts ">#{line}<"    if debug

  pos = 0
  ## track last offsets - to report error on no match
  ##   or no match in end of string
  offsets = [0,0]

  while m = RE.match( line, pos )
    if debug
      pp m
      puts "pos: #{pos}"
    end
    offsets = [m.begin(0), m.end(0)]

    if offsets[0] != pos
      ## match NOT starting at start/begin position!!!
      ##  report parse error!!!
      ##  note - the reported offsets are those of the match found
      ##           (that is, after the skipped text)
      err = "parse error - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]}"
      msg = "!! WARN - #{err} in line >#{line}<"
      puts msg

      errors << err
      log( msg )
    end

    ##
    ## todo/fix - also check if possible
    ##      if no match but not yet end of string!!!!
    ##    report skipped text run too!!!

    pos = offsets[1]

    pp offsets   if debug

    t = if m[:space]
           ## skip space
           nil
        elsif m[:spaces]
           ## skip spaces
           nil
        elsif m[:text]
          [:text, m[:text]]   ## keep pos - why? why not?
        elsif m[:status]   ## (match) status e.g. cancelled, awarded, etc.
          [:status, m[:status]]
        elsif m[:time]
          if typed
            ## unify to iso-format
            ###   12.40 => 12:40
            ##    12h40 => 12:40 etc.
            ##  keep string (no time-only type in ruby)
            hour   = m[:hour].to_i(10)    ## allow 08/07/etc.
            minute = m[:minute].to_i(10)
            ## check if valid - 0:00 - 24:00
            ##   check if 24:00 possible? or only 0:00 (23:59)
            ## NOTE(review): 24:01 to 24:59 pass this check too
            if (hour >= 0 && hour <= 24) &&
               (minute >=0 && minute <= 59)
              ## note - for debugging keep (pass along) "literal" time
              ##   might use/add support for am/pm later
              [:time, m[:time], {h:hour,m:minute}]
            else
              raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
            end
          else
            [:time, m[:time]]
          end
        elsif m[:date]
          if typed
            date = {}
            ## note - DATE_RE is defined elsewhere; this branch reads
            ##   the (optional) captures year, month_name, day and day_name
            ## map month names
            ## note - allow any/upcase JULY/JUL etc. thus ALWAYS downcase for lookup
            date[:y]    = m[:year].to_i(10)                  if m[:year]
            date[:m]    = MONTH_MAP[ m[:month_name].downcase ]   if m[:month_name]
            date[:d]    = m[:day].to_i(10)                   if m[:day]
            date[:wday] = DAY_MAP[ m[:day_name].downcase ]   if m[:day_name]
            ## note - for debugging keep (pass along) "literal" date
            [:date, m[:date], date]
          else
            [:date, m[:date]]
          end
        elsif m[:timezone]
          [:timezone, m[:timezone]]
        elsif m[:duration]
          [:duration, m[:duration]]
        elsif m[:num]
          if typed
            ## note - strip enclosing () and convert to integer
            [:num, m[:value].to_i(10)]
          else
            [:num, m[:num]]
          end
        elsif m[:score]
          if typed
            score = {}
            ## check for pen
            score[:p]  = [m[:p1].to_i(10),
                          m[:p2].to_i(10)]    if m[:p1] && m[:p2]
            score[:et] = [m[:et1].to_i(10),
                          m[:et2].to_i(10)]   if m[:et1] && m[:et2]
            score[:ft] = [m[:ft1].to_i(10),
                          m[:ft2].to_i(10)]   if m[:ft1] && m[:ft2]
            score[:ht] = [m[:ht1].to_i(10),
                          m[:ht2].to_i(10)]   if m[:ht1] && m[:ht2]

            ## note - for debugging keep (pass along) "literal" score
            [:score, m[:score], score]
          else
            [:score, m[:score]]
          end
        elsif m[:minute]
          if typed
            minute = {}
            minute[:m]      = m[:value].to_i(10)
            minute[:offset] = m[:value2].to_i(10)   if m[:value2]
            ## note - for debugging keep (pass along) "literal" minute
            [:minute, m[:minute], minute]
          else
            [:minute, m[:minute]]
          end
        elsif m[:og]
          typed ? [:og] : [:og, m[:og]]    ## for typed drop - string version/variants
        elsif m[:pen]
          typed ? [:pen] : [:pen, m[:pen]]
        elsif m[:vs]
          typed ? [:vs] : [:vs, m[:vs]]
        elsif m[:none]
          typed ? [:none] : [:none, m[:none]]
        elsif m[:sym]
          sym = m[:sym]
          ## return symbols "inline" as is - why? why not?
          case sym
          when ',' then [:',']
          when ';' then [:';']
          when '@' then [:'@']
          when '|' then [:'|']
          else
            nil  ## ignore others (e.g. brackets [])
          end
        else
          ## report error
          nil
        end

    tokens << t    if t

    if debug
      print ">"
      print "*" * pos
      puts "#{line[pos..-1]}<"
    end
  end

  ## check if no match in end of string
  if offsets[1] != line.size
    err = "parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size}"
    msg = "!! WARN - #{err} in line >#{line}<"
    puts msg
    log( msg )

    errors << err
  end


  [tokens,errors]
end
|
351
|
+
|
352
|
+
|
353
|
+
###   convenience helper - ignore errors by default
##
#  same as tokenize_with_errors but returns the tokens only;
#    the errors list is dropped
#
#  params:
#    line  - string to tokenize
#    typed - passed on as is to tokenize_with_errors
#    debug - passed on as is to tokenize_with_errors
#
#  returns the tokens (first item of the tokenize_with_errors result)
def tokenize( line, typed: false,
                    debug: false )
  tokens, _ = tokenize_with_errors( line, typed: typed,
                                          debug: debug )
  tokens
end
|
360
|
+
|
361
|
+
|
362
|
+
end # class Parser
|
363
|
+
end # module SportDb
|
364
|
+
|
@@ -0,0 +1,44 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
####
|
4
|
+
# try a (simple) tokenizer/parser with regex
|
5
|
+
|
6
|
+
## note - match line-by-line
|
7
|
+
# avoid massive backtracking by definition
|
8
|
+
# that is, making it impossible
|
9
|
+
|
10
|
+
## sym(bols) -
|
11
|
+
## text - change text to name - why? why not?
|
12
|
+
|
13
|
+
|
14
|
+
|
15
|
+
require_relative 'parser/token-score'
|
16
|
+
require_relative 'parser/token-date'
|
17
|
+
require_relative 'parser/token-text'
|
18
|
+
require_relative 'parser/token'
|
19
|
+
require_relative 'parser/lang'
|
20
|
+
require_relative 'parser/parser'
|
21
|
+
|
22
|
+
|
23
|
+
## more
|
24
|
+
require_relative 'parser/outline_reader'
|
25
|
+
require_relative 'parser/linter'
|
26
|
+
|
27
|
+
|
28
|
+
###
|
29
|
+
# make parser api (easily) available - why? why not?
|
30
|
+
|
31
|
+
=begin
|
32
|
+
module SportDb
|
33
|
+
def self.parser() @@parser ||= Parser.new; end
|
34
|
+
def self.parse( ... )
|
35
|
+
end
|
36
|
+
def self.tokenize( ... )
|
37
|
+
end
|
38
|
+
end # module SportDb
|
39
|
+
=end
|
40
|
+
|
41
|
+
|
42
|
+
|
43
|
+
|
44
|
+
|
metadata
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: sportdb-parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Gerald Bauer
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2024-07-14 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rdoc
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '4.0'
|
20
|
+
- - "<"
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: '7'
|
23
|
+
type: :development
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '4.0'
|
30
|
+
- - "<"
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '7'
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: hoe
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - "~>"
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '4.1'
|
40
|
+
type: :development
|
41
|
+
prerelease: false
|
42
|
+
version_requirements: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - "~>"
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '4.1'
|
47
|
+
description: sportdb-parser - football.txt match parser (& tokenizer)
|
48
|
+
email: gerald.bauer@gmail.com
|
49
|
+
executables:
|
50
|
+
- fbt
|
51
|
+
extensions: []
|
52
|
+
extra_rdoc_files:
|
53
|
+
- CHANGELOG.md
|
54
|
+
- Manifest.txt
|
55
|
+
- README.md
|
56
|
+
files:
|
57
|
+
- CHANGELOG.md
|
58
|
+
- Manifest.txt
|
59
|
+
- README.md
|
60
|
+
- Rakefile
|
61
|
+
- bin/fbt
|
62
|
+
- lib/sportdb/parser.rb
|
63
|
+
- lib/sportdb/parser/lang.rb
|
64
|
+
- lib/sportdb/parser/linter.rb
|
65
|
+
- lib/sportdb/parser/outline_reader.rb
|
66
|
+
- lib/sportdb/parser/parser.rb
|
67
|
+
- lib/sportdb/parser/token-date.rb
|
68
|
+
- lib/sportdb/parser/token-score.rb
|
69
|
+
- lib/sportdb/parser/token-text.rb
|
70
|
+
- lib/sportdb/parser/token.rb
|
71
|
+
homepage: https://github.com/sportdb/sport.db
|
72
|
+
licenses:
|
73
|
+
- Public Domain
|
74
|
+
metadata: {}
|
75
|
+
post_install_message:
|
76
|
+
rdoc_options:
|
77
|
+
- "--main"
|
78
|
+
- README.md
|
79
|
+
require_paths:
|
80
|
+
- lib
|
81
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
82
|
+
requirements:
|
83
|
+
- - ">="
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: 2.2.2
|
86
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
87
|
+
requirements:
|
88
|
+
- - ">="
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
version: '0'
|
91
|
+
requirements: []
|
92
|
+
rubygems_version: 3.4.10
|
93
|
+
signing_key:
|
94
|
+
specification_version: 4
|
95
|
+
summary: sportdb-parser - football.txt match parser (& tokenizer)
|
96
|
+
test_files: []
|