sportdb-parser 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Manifest.txt +14 -0
- data/README.md +8 -0
- data/Rakefile +27 -0
- data/bin/fbt +144 -0
- data/lib/sportdb/parser/lang.rb +111 -0
- data/lib/sportdb/parser/linter.rb +153 -0
- data/lib/sportdb/parser/outline_reader.rb +101 -0
- data/lib/sportdb/parser/parser.rb +196 -0
- data/lib/sportdb/parser/token-date.rb +193 -0
- data/lib/sportdb/parser/token-score.rb +121 -0
- data/lib/sportdb/parser/token-text.rb +114 -0
- data/lib/sportdb/parser/token.rb +364 -0
- data/lib/sportdb/parser.rb +44 -0
- metadata +96 -0
@@ -0,0 +1,364 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
module SportDb
|
4
|
+
class Parser
|
5
|
+
|
6
|
+
|
7
|
+
##
|
8
|
+
# keep 18h30 - why? why not?
|
9
|
+
# add support for 6:30pm 8:20am etc. - why? why not?
|
10
|
+
|
11
|
+
TIME_RE = %r{
  ## clock time - one or two hour digits, a separator and exactly
  ##   two minute digits, e.g. 18.30 or 18:30 or 18h30
  ##   note - no range check here (99:99 matches too)
  (?<time>
     \b
     (?<hour>   \d{1,2} )
     [:.h]                   ## separator - colon, dot or h
     (?<minute> \d{2} )
     \b
  )
}ix
|
20
|
+
|
21
|
+
|
22
|
+
|
23
|
+
##
|
24
|
+
# for timezone format use for now:
|
25
|
+
# (BRT/UTC-3) (e.g. brazil time)
|
26
|
+
#
|
27
|
+
# (CET/UTC+1) - central european time
|
28
|
+
# (CEST/UTC+2) - central european summer time - daylight saving time (DST).
|
29
|
+
# (EET/UTC+2) - eastern european time
|
30
|
+
# (EEST/UTC+3) - eastern european summer time - daylight saving time (DST).
|
31
|
+
#
|
32
|
+
# UTC+3
|
33
|
+
# UTC+4
|
34
|
+
# UTC+0
|
35
|
+
# UTC+00
|
36
|
+
# UTC+0000
|
37
|
+
#
|
38
|
+
# - allow +01 or +0100 - why? why not
|
39
|
+
# - +0130 (01:30)
|
40
|
+
#
|
41
|
+
# see
|
42
|
+
# https://en.wikipedia.org/wiki/Time_zone
|
43
|
+
# https://en.wikipedia.org/wiki/List_of_UTC_offsets
|
44
|
+
# https://en.wikipedia.org/wiki/UTC−04:00 etc.
|
45
|
+
|
46
|
+
TIMEZONE_RE = %r{
  ## parenthesized timezone, e.g. (UTC-2) or (CEST/UTC-2)
  (?<timezone>
     \(
        (?: [a-z]+ / )?    ## optional "local" zone name plus slash, e.g. BRT/ or CEST/
        [a-z]+             ## base name (letters only), e.g. UTC
        [+-]               ## sign of the offset
        \d{1,4}            ## offset digits, e.g. 0 or 00 or 0000
     \)
  )
}ix
|
60
|
+
|
61
|
+
|
62
|
+
|
63
|
+
|
64
|
+
BASICS_RE = %r{
    ## number in parentheses e.g. (51) or (1) - digit count is not limited
    (?<num> \( (?<value> \d+ ) \) )
  |
    ## separator between two sides - vs or vs. or v or v. or a dash;
    ##   a single space is required before and after
    (?<vs>
       (?<=[ ])
       (?: vs\.? | v\.? | - )   ## note - longer alternative (vs) listed before shorter (v)
       (?=[ ])
    )
  |
    ## lone dash placeholder - preceded by a space, an opening bracket or
    ##   the line start and followed by optional spaces and a semicolon
    (?<none>
       (?<=[ \[]|^)
       -
       (?=[ ]*;)
    )
  |
    (?<spaces> [ ]{2,} )     ## run of two or more spaces
  |
    (?<space>  [ ] )         ## exactly one space
  |
    (?<sym> [;,@|\[\]] )     ## single-character symbols
}ix
|
89
|
+
|
90
|
+
|
91
|
+
MINUTE_RE = %r{
  ## minute e.g. 45' or 90+2' - must directly follow a space
  (?<minute>
     (?<=[ ])
     (?<value> \d{1,3} )                 ## one to three digits (0 to 999)
     (?: \+ (?<value2> \d{1,3} ) )?      ## optional +offset, e.g. the 2 in 90+2'
     '                                   ## minute marker is required
  )
}ix
|
101
|
+
|
102
|
+
|
103
|
+
## (match) status
|
104
|
+
## note: english usage - cancelled (in UK), canceled (in US)
|
105
|
+
##
|
106
|
+
## add more variants - why? why not?
|
107
|
+
|
108
|
+
STATUS_RE = %r{
  ## (match) status word or abbreviation; must start at a word boundary
  ##   and be followed by a space, a closing bracket or the end of line
  (?<status>
     \b
     (?: cancelled | canceled | can\.
       | abandoned | abd\.
       | postponed
       | awarded   | awd\.
       | replay
     )
     (?=[ \]]|$)
  )
}ix
|
124
|
+
|
125
|
+
## todo/check: remove lookahead assertion here - why require space?
|
126
|
+
## note: \b works only after non-alphanum
|
127
|
+
## to make it work with awd. (dot) "custom" lookahead needed
|
128
|
+
|
129
|
+
|
130
|
+
## goal types
|
131
|
+
# (pen.) or (pen) or (p.) or (p)
|
132
|
+
## (o.g.) or (og)
|
133
|
+
GOAL_PEN_RE = %r{
  ## penalty goal marker - (pen.) or (pen) or (p.) or (p)
  (?<pen>
     \(
        (?: pen | p ) \.?
     \)
  )
}ix
|
139
|
+
GOAL_OG_RE = %r{
  ## own goal marker - (og) or (o.g.)
  (?<og>
     \(
        (?: og | o\.g\. )
     \)
  )
}ix
|
145
|
+
|
146
|
+
|
147
|
+
|
148
|
+
|
149
|
+
## combined tokenizer pattern - alternatives are tried in the order listed
##   note - duration MUST match before date
RE = Regexp.union( [
       STATUS_RE,
       TIMEZONE_RE,
       TIME_RE,
       DURATION_RE,
       DATE_RE,
       SCORE_RE,
       BASICS_RE,
       MINUTE_RE,
       GOAL_OG_RE,
       GOAL_PEN_RE,
       TEXT_RE,
     ] )
|
158
|
+
|
159
|
+
|
160
|
+
##
# Append msg plus a trailing newline to ./logs.txt
#   (path is relative to the current working directory;
#    the file is opened in append mode with utf-8 encoding).
def log( msg )
  File.open( './logs.txt', 'a:utf-8' ) do |out|
    out.write( msg )
    out.write( "\n" )
  end
end
|
168
|
+
|
169
|
+
|
170
|
+
|
171
|
+
##
# Scan line left to right with RE and collect tokens plus parse errors.
#
#  line  - text to tokenize (matching starts at offset 0)
#  typed - if true, time / date / num / score / minute tokens carry parsed
#            values (integers / hashes) and og / pen / vs / none tokens
#            drop the literal text
#  debug - if true, print the match data and progress to stdout
#
# Text that RE skips over (between two matches or after the last match)
#   is reported three ways - printed to stdout, appended via log and
#   added to the returned error list.
#
# Returns a two-element array [tokens, errors].
# Raises ArgumentError in typed mode if a time has an hour outside 0..24
#   or a minute outside 0..59.
def tokenize_with_errors( line, typed: false, debug: false )
  tokens = []
  errors = []

  puts ">#{line}<" if debug

  pos     = 0
  offsets = [0, 0]    ## span of the latest match - stays [0,0] if nothing matches

  while (m = RE.match( line, pos ))
    if debug
      pp m
      puts "pos: #{pos}"
    end

    offsets    = m.offset( 0 )   ## [begin, end] of the whole match
    from, upto = offsets

    unless from == pos
      ## match does NOT start where the last one ended - report the gap
      skipped = line[pos...from]
      msg = "!! WARN - parse error - skipping >#{skipped}< @#{from},#{upto} in line >#{line}<"
      puts msg

      errors << "parse error - skipping >#{skipped}< @#{from},#{upto}"
      log( msg )
    end

    pos = upto

    pp offsets if debug

    token =
      if m[:space] || m[:spaces]
        nil       ## whitespace only separates tokens
      elsif m[:text]
        [:text, m[:text]]
      elsif m[:status]
        [:status, m[:status]]
      elsif m[:time]
        if typed
          ## keep the literal time and pass along hour / minute as integers
          hh = m[:hour].to_i(10)
          mm = m[:minute].to_i(10)
          unless hh.between?( 0, 24 ) && mm.between?( 0, 59 )
            raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
          end
          [:time, m[:time], {h: hh, m: mm}]
        else
          [:time, m[:time]]
        end
      elsif m[:date]
        if typed
          year, month_name, day, day_name = m[:year], m[:month_name], m[:day], m[:day_name]

          ## note - names get downcased before the map lookup
          parts = {}
          parts[:y]    = year.to_i(10)                  if year
          parts[:m]    = MONTH_MAP[ month_name.downcase ] if month_name
          parts[:d]    = day.to_i(10)                   if day
          parts[:wday] = DAY_MAP[ day_name.downcase ]   if day_name
          [:date, m[:date], parts]
        else
          [:date, m[:date]]
        end
      elsif m[:timezone]
        [:timezone, m[:timezone]]
      elsif m[:duration]
        [:duration, m[:duration]]
      elsif m[:num]
        ## typed - use the digits inside the parentheses as integer
        typed ? [:num, m[:value].to_i(10)] : [:num, m[:num]]
      elsif m[:score]
        if typed
          ## collect each pair (p, et, ft, ht) only if both sides are present
          score = {}
          [:p, :et, :ft, :ht].each do |key|
            left  = m["#{key}1"]
            right = m["#{key}2"]
            score[key] = [left.to_i(10), right.to_i(10)] if left && right
          end
          [:score, m[:score], score]
        else
          [:score, m[:score]]
        end
      elsif m[:minute]
        if typed
          parts = { m: m[:value].to_i(10) }
          parts[:offset] = m[:value2].to_i(10) if m[:value2]
          [:minute, m[:minute], parts]
        else
          [:minute, m[:minute]]
        end
      elsif m[:og]
        typed ? [:og] : [:og, m[:og]]
      elsif m[:pen]
        typed ? [:pen] : [:pen, m[:pen]]
      elsif m[:vs]
        typed ? [:vs] : [:vs, m[:vs]]
      elsif m[:none]
        typed ? [:none] : [:none, m[:none]]
      elsif m[:sym]
        ## pass along , ; @ | as one-element tokens; brackets get dropped
        sym = m[:sym]
        [',', ';', '@', '|'].include?( sym ) ? [sym.to_sym] : nil
      else
        nil
      end

    tokens << token if token

    if debug
      print ">"
      print "*" * pos
      puts "#{line[pos..-1]}<"
    end
  end

  ## anything left after the last match is unparsed too
  tail = offsets[1]
  unless tail == line.size
    msg = "!! WARN - parse error - skipping >#{line[tail..-1]}< @#{tail},#{line.size} in line >#{line}<"
    puts msg
    log( msg )

    errors << "parse error - skipping >#{line[tail..-1]}< @#{tail},#{line.size}"
  end

  [tokens, errors]
end
|
351
|
+
|
352
|
+
|
353
|
+
### convenience helper - ignore errors by default
|
354
|
+
##
# Same as tokenize_with_errors but returns the tokens only
#   (the error list gets dropped).
#
#  line, typed, debug - passed along unchanged to tokenize_with_errors
def tokenize( line, typed: false, debug: false )
  tokenize_with_errors( line, typed: typed, debug: debug ).first
end
|
360
|
+
|
361
|
+
|
362
|
+
end # class Parser
|
363
|
+
end # module SportDb
|
364
|
+
|
@@ -0,0 +1,44 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
####
|
4
|
+
# try a (simple) tokenizer/parser with regex
|
5
|
+
|
6
|
+
## note - match line-by-line
|
7
|
+
# avoid massive backtracking by definition
|
8
|
+
# that is, making it impossible
|
9
|
+
|
10
|
+
## sym(bols) -
|
11
|
+
## text - change text to name - why? why not?
|
12
|
+
|
13
|
+
|
14
|
+
|
15
|
+
require_relative 'parser/token-score'
|
16
|
+
require_relative 'parser/token-date'
|
17
|
+
require_relative 'parser/token-text'
|
18
|
+
require_relative 'parser/token'
|
19
|
+
require_relative 'parser/lang'
|
20
|
+
require_relative 'parser/parser'
|
21
|
+
|
22
|
+
|
23
|
+
## more
|
24
|
+
require_relative 'parser/outline_reader'
|
25
|
+
require_relative 'parser/linter'
|
26
|
+
|
27
|
+
|
28
|
+
###
|
29
|
+
# make parser api (easily) available - why? why not?
|
30
|
+
|
31
|
+
=begin
|
32
|
+
module SportDb
|
33
|
+
def self.parser() @@parser ||= Parser.new; end
|
34
|
+
def self.parse( ... )
|
35
|
+
end
|
36
|
+
def self.tokenize( ... )
|
37
|
+
end
|
38
|
+
end # module SportDb
|
39
|
+
=end
|
40
|
+
|
41
|
+
|
42
|
+
|
43
|
+
|
44
|
+
|
metadata
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: sportdb-parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Gerald Bauer
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2024-07-14 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rdoc
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '4.0'
|
20
|
+
- - "<"
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: '7'
|
23
|
+
type: :development
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '4.0'
|
30
|
+
- - "<"
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '7'
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: hoe
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - "~>"
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '4.1'
|
40
|
+
type: :development
|
41
|
+
prerelease: false
|
42
|
+
version_requirements: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - "~>"
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '4.1'
|
47
|
+
description: sportdb-parser - football.txt match parser (& tokenizer)
|
48
|
+
email: gerald.bauer@gmail.com
|
49
|
+
executables:
|
50
|
+
- fbt
|
51
|
+
extensions: []
|
52
|
+
extra_rdoc_files:
|
53
|
+
- CHANGELOG.md
|
54
|
+
- Manifest.txt
|
55
|
+
- README.md
|
56
|
+
files:
|
57
|
+
- CHANGELOG.md
|
58
|
+
- Manifest.txt
|
59
|
+
- README.md
|
60
|
+
- Rakefile
|
61
|
+
- bin/fbt
|
62
|
+
- lib/sportdb/parser.rb
|
63
|
+
- lib/sportdb/parser/lang.rb
|
64
|
+
- lib/sportdb/parser/linter.rb
|
65
|
+
- lib/sportdb/parser/outline_reader.rb
|
66
|
+
- lib/sportdb/parser/parser.rb
|
67
|
+
- lib/sportdb/parser/token-date.rb
|
68
|
+
- lib/sportdb/parser/token-score.rb
|
69
|
+
- lib/sportdb/parser/token-text.rb
|
70
|
+
- lib/sportdb/parser/token.rb
|
71
|
+
homepage: https://github.com/sportdb/sport.db
|
72
|
+
licenses:
|
73
|
+
- Public Domain
|
74
|
+
metadata: {}
|
75
|
+
post_install_message:
|
76
|
+
rdoc_options:
|
77
|
+
- "--main"
|
78
|
+
- README.md
|
79
|
+
require_paths:
|
80
|
+
- lib
|
81
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
82
|
+
requirements:
|
83
|
+
- - ">="
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: 2.2.2
|
86
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
87
|
+
requirements:
|
88
|
+
- - ">="
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
version: '0'
|
91
|
+
requirements: []
|
92
|
+
rubygems_version: 3.4.10
|
93
|
+
signing_key:
|
94
|
+
specification_version: 4
|
95
|
+
summary: sportdb-parser - football.txt match parser (& tokenizer)
|
96
|
+
test_files: []
|