sportdb-parser 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,364 @@
1
+
2
+
3
+ module SportDb
4
+ class Parser
5
+
6
+
7
+ ##
8
+ # keep 18h30 - why? why not?
9
+ # add support for 6:30pm 8:20am etc. - why? why not?
10
+
11
+ TIME_RE = %r{
12
+ ## e.g. 18.30 (or 18:30 or 18h30)
13
+ (?<time> \b
14
+ (?<hour>\d{1,2})
15
+ (?: :|\.|h )
16
+ (?<minute>\d{2})
17
+ \b
18
+ )
19
+ }ix
20
+
21
+
22
+
23
+ ##
24
+ # for timezone format use for now:
25
+ # (BRT/UTC-3) (e.g. brazil time)
26
+ #
27
+ # (CET/UTC+1) - central european time
28
+ # (CEST/UTC+2) - central european summer time - daylight saving time (DST).
29
+ # (EET/UTC+2) - eastern european time
29
+ # (EEST/UTC+3) - eastern european summer time - daylight saving time (DST).
31
+ #
32
+ # UTC+3
33
+ # UTC+4
34
+ # UTC+0
35
+ # UTC+00
36
+ # UTC+0000
37
+ #
38
+ # - allow +01 or +0100 - why? why not
39
+ # - +0130 (01:30)
40
+ #
41
+ # see
42
+ # https://en.wikipedia.org/wiki/Time_zone
43
+ # https://en.wikipedia.org/wiki/List_of_UTC_offsets
44
+ # https://en.wikipedia.org/wiki/UTC−04:00 etc.
45
+
46
+ TIMEZONE_RE = %r{
47
+ ## e.g. (UTC-2) or (CEST/UTC+2) etc.
48
+ (?<timezone>
49
+ \(
50
+ ## optional "local" timezone name eg. BRT or CEST etc.
51
+ (?: [a-z]+
52
+ /
53
+ )?
54
+ [a-z]+
55
+ [+-]
56
+ \d{1,4} ## e.g. 0 or 00 or 0000
57
+ \)
58
+ )
59
+ }ix
60
+
61
+
62
+
63
+
64
+ BASICS_RE = %r{
65
+ ## e.g. (51) or (1) etc. - limit digits of number???
66
+ (?<num> \( (?<value>\d+) \) )
67
+ |
68
+ (?<vs>
69
+ (?<=[ ]) # Positive lookbehind for space
70
+ (?:
71
+ vs\.?| ## allow optional dot (eg. vs. v.)
72
+ v\.?|
73
+ -
74
+ ) # note: longer alternative listed first, e.g. vs before v etc.
75
+ (?=[ ]) # positive lookahead for space
76
+ )
77
+ |
78
+ (?<none>
79
+ (?<=[ \[]|^) # Positive lookbehind for space or [ or start of line
80
+ -
81
+ (?=[ ]*;) # positive lookahead for optional spaces followed by ;
82
+ )
83
+ |
84
+ (?<spaces> [ ]{2,}) |
85
+ (?<space> [ ])
86
+ |
87
+ (?<sym>[;,@|\[\]])
88
+ }ix
89
+
90
+
91
+ MINUTE_RE = %r{
92
+ (?<minute>
93
+ (?<=[ ]) # Positive lookbehind for space required
94
+ (?<value>\d{1,3}) ## constrain numbers to 0 to 999!!!
95
+ (?: \+
96
+ (?<value2>\d{1,3})
97
+ )?
98
+ ' ## must have minute marker!!!!
99
+ )
100
+ }ix
101
+
102
+
103
+ ## (match) status
104
+ ## note: english usage - cancelled (in UK), canceled (in US)
105
+ ##
106
+ ## add more variants - why? why not?
107
+
108
+ STATUS_RE = %r{
109
+ (?<status>
110
+ \b
111
+ (?:
112
+ cancelled|canceled|can\.
113
+ |
114
+ abandoned|abd\.
115
+ |
116
+ postponed
117
+ |
118
+ awarded|awd\.
119
+ |
120
+ replay
121
+ )
122
+ (?=[ \]]|$)
123
+ )}ix
124
+
125
+ ## todo/check: remove lookahead assertion here - why require space?
126
+ ## note: \b only matches between a word char and a non-word char
126
+ ## to make it work after awd. (trailing dot) a "custom" lookahead is needed
128
+
129
+
130
+ ## goal types
131
+ # (pen.) or (pen) or (p.) or (p)
132
+ ## (o.g.) or (og)
133
+ GOAL_PEN_RE = %r{
134
+ (?<pen> \(
135
+ (?:pen|p)\.?
136
+ \)
137
+ )
138
+ }ix
139
+ GOAL_OG_RE = %r{
140
+ (?<og> \(
141
+ (?:og|o\.g\.)
142
+ \)
143
+ )
144
+ }ix
145
+
146
+
147
+
148
+
149
+ RE = Regexp.union( STATUS_RE,
150
+ TIMEZONE_RE,
151
+ TIME_RE,
152
+ DURATION_RE, # note - duration MUST match before date
153
+ DATE_RE,
154
+ SCORE_RE,
155
+ BASICS_RE, MINUTE_RE,
156
+ GOAL_OG_RE, GOAL_PEN_RE,
157
+ TEXT_RE )
158
+
159
+
160
+ def log( msg )
161
+ ## append msg to ./logs.txt
162
+ ## use ./errors.txt - why? why not?
163
+ File.open( './logs.txt', 'a:utf-8' ) do |f|
164
+ f.write( msg )
165
+ f.write( "\n" )
166
+ end
167
+ end
168
+
169
+
170
+
171
+ def tokenize_with_errors( line, typed: false,
172
+ debug: false )
173
+ tokens = []
174
+ errors = [] ## keep a list of errors - why? why not?
175
+
176
+ puts ">#{line}<" if debug
177
+
178
+ pos = 0
179
+ ## track last offsets - to report error on no match
180
+ ## or no match in end of string
181
+ offsets = [0,0]
182
+ m = nil
183
+
184
+ while m = RE.match( line, pos )
185
+ if debug
186
+ pp m
187
+ puts "pos: #{pos}"
188
+ end
189
+ offsets = [m.begin(0), m.end(0)]
190
+
191
+ if offsets[0] != pos
192
+ ## match NOT starting at start/begin position!!!
193
+ ## report parse error!!!
194
+ msg = "!! WARN - parse error - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
195
+ puts msg
196
+
197
+ errors << "parse error - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]}"
198
+ log( msg )
199
+ end
200
+
201
+ ##
202
+ ## todo/fix - also check if possible
203
+ ## if no match but not yet end of string!!!!
204
+ ## report skipped text run too!!!
205
+
206
+ pos = offsets[1]
207
+
208
+ pp offsets if debug
209
+
210
+ t = if m[:space]
211
+ ## skip space
212
+ nil
213
+ elsif m[:spaces]
214
+ ## skip spaces
215
+ nil
216
+ elsif m[:text]
217
+ [:text, m[:text]] ## keep pos - why? why not?
218
+ elsif m[:status] ## (match) status e.g. cancelled, awarded, etc.
219
+ [:status, m[:status]]
220
+ elsif m[:time]
221
+ if typed
222
+ ## unify to iso-format
223
+ ### 12.40 => 12:40
224
+ ## 12h40 => 12:40 etc.
225
+ ## keep string (no time-only type in ruby)
226
+ hour = m[:hour].to_i(10) ## allow 08/07/etc.
227
+ minute = m[:minute].to_i(10)
228
+ ## check if valid - 0:00 - 24:00
229
+ ## check if 24:00 possible? or only 0:00 (23:59)
230
+ if (hour >= 0 && hour <= 24) &&
231
+ (minute >=0 && minute <= 59)
232
+ ## note - for debugging keep (pass along) "literal" time
233
+ ## might use/add support for am/pm later
234
+ [:time, m[:time], {h:hour,m:minute}]
235
+ else
236
+ raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
237
+ end
238
+ else
239
+ [:time, m[:time]]
240
+ end
241
+ elsif m[:date]
242
+ if typed
243
+ date = {}
244
+ =begin
245
+ ((?<day_name>#{DAY_NAMES})
246
+ [ ]
247
+ )?
248
+ (?<month_name>#{MONTH_NAMES})
249
+ (?: \/|[ ] )
250
+ (?<day>\d{1,2})
251
+ ## optional year
252
+ ( [ ]
253
+ (?<year>\d{4})
254
+ )?
255
+ =end
256
+ ## map month names
257
+ ## note - allow any/upcase JULY/JUL etc. thus ALWAYS downcase for lookup
258
+ date[:y] = m[:year].to_i(10) if m[:year]
259
+ date[:m] = MONTH_MAP[ m[:month_name].downcase ] if m[:month_name]
260
+ date[:d] = m[:day].to_i(10) if m[:day]
261
+ date[:wday] = DAY_MAP[ m[:day_name].downcase ] if m[:day_name]
262
+ ## note - for debugging keep (pass along) "literal" date
263
+ [:date, m[:date], date]
264
+ else
265
+ [:date, m[:date]]
266
+ end
267
+ elsif m[:timezone]
268
+ [:timezone, m[:timezone]]
269
+ elsif m[:duration]
270
+ [:duration, m[:duration]]
271
+ elsif m[:num]
272
+ if typed
273
+ ## note - strip enclosing () and convert to integer
274
+ [:num, m[:value].to_i(10)]
275
+ else
276
+ [:num, m[:num]]
277
+ end
278
+ elsif m[:score]
279
+ if typed
280
+ score = {}
281
+ ## check for pen
282
+ score[:p] = [m[:p1].to_i(10),
283
+ m[:p2].to_i(10)] if m[:p1] && m[:p2]
284
+ score[:et] = [m[:et1].to_i(10),
285
+ m[:et2].to_i(10)] if m[:et1] && m[:et2]
286
+ score[:ft] = [m[:ft1].to_i(10),
287
+ m[:ft2].to_i(10)] if m[:ft1] && m[:ft2]
288
+ score[:ht] = [m[:ht1].to_i(10),
289
+ m[:ht2].to_i(10)] if m[:ht1] && m[:ht2]
290
+
291
+ ## note - for debugging keep (pass along) "literal" score
292
+ [:score, m[:score], score]
293
+ else
294
+ [:score, m[:score]]
295
+ end
296
+ elsif m[:minute]
297
+ if typed
298
+ minute = {}
299
+ minute[:m] = m[:value].to_i(10)
300
+ minute[:offset] = m[:value2].to_i(10) if m[:value2]
301
+ ## note - for debugging keep (pass along) "literal" minute
302
+ [:minute, m[:minute], minute]
303
+ else
304
+ [:minute, m[:minute]]
305
+ end
306
+ elsif m[:og]
307
+ typed ? [:og] : [:og, m[:og]] ## for typed drop - string version/variants
308
+ elsif m[:pen]
309
+ typed ? [:pen] : [:pen, m[:pen]]
310
+ elsif m[:vs]
311
+ typed ? [:vs] : [:vs, m[:vs]]
312
+ elsif m[:none]
313
+ typed ? [:none] : [:none, m[:none]]
314
+ elsif m[:sym]
315
+ sym = m[:sym]
316
+ ## return symbols "inline" as is - why? why not?
317
+ case sym
318
+ when ',' then [:',']
319
+ when ';' then [:';']
320
+ when '@' then [:'@']
321
+ when '|' then [:'|']
322
+ else
323
+ nil ## ignore others (e.g. brackets [])
324
+ end
325
+ else
326
+ ## report error
327
+ nil
328
+ end
329
+
330
+ tokens << t if t
331
+
332
+ if debug
333
+ print ">"
334
+ print "*" * pos
335
+ puts "#{line[pos..-1]}<"
336
+ end
337
+ end
338
+
339
+ ## check if no match at end of string
340
+ if offsets[1] != line.size
341
+ msg = "!! WARN - parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
342
+ puts msg
343
+ log( msg )
344
+
345
+ errors << "parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size}"
346
+ end
347
+
348
+
349
+ [tokens,errors]
350
+ end
351
+
352
+
353
+ ### convenience helper - ignore errors by default
354
+ def tokenize( line, typed: false,
355
+ debug: false )
356
+ tokens, _ = tokenize_with_errors( line, typed: typed,
357
+ debug: debug )
358
+ tokens
359
+ end
360
+
361
+
362
+ end # class Parser
363
+ end # module SportDb
364
+
@@ -0,0 +1,44 @@
1
+
2
+
3
+ ####
4
+ # try a (simple) tokenizer/parser with regex
5
+
6
+ ## note - match line-by-line
7
+ # avoid massive backtracking by definition
8
+ # that is, making it impossible
9
+
10
+ ## sym(bols) -
11
+ ## text - change text to name - why? why not?
12
+
13
+
14
+
15
+ require_relative 'parser/token-score'
16
+ require_relative 'parser/token-date'
17
+ require_relative 'parser/token-text'
18
+ require_relative 'parser/token'
19
+ require_relative 'parser/lang'
20
+ require_relative 'parser/parser'
21
+
22
+
23
+ ## more
24
+ require_relative 'parser/outline_reader'
25
+ require_relative 'parser/linter'
26
+
27
+
28
+ ###
29
+ # make parser api (easily) available - why? why not?
30
+
31
+ =begin
32
+ module SportDb
33
+ def self.parser() @@parser ||= Parser.new; end
34
+ def self.parse( ... )
35
+ end
36
+ def self.tokenize( ... )
37
+ end
38
+ end # module SportDb
39
+ =end
40
+
41
+
42
+
43
+
44
+
metadata ADDED
@@ -0,0 +1,96 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sportdb-parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Gerald Bauer
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2024-07-14 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rdoc
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '4.0'
20
+ - - "<"
21
+ - !ruby/object:Gem::Version
22
+ version: '7'
23
+ type: :development
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ version: '4.0'
30
+ - - "<"
31
+ - !ruby/object:Gem::Version
32
+ version: '7'
33
+ - !ruby/object:Gem::Dependency
34
+ name: hoe
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '4.1'
40
+ type: :development
41
+ prerelease: false
42
+ version_requirements: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '4.1'
47
+ description: sportdb-parser - football.txt match parser (& tokenizer)
48
+ email: gerald.bauer@gmail.com
49
+ executables:
50
+ - fbt
51
+ extensions: []
52
+ extra_rdoc_files:
53
+ - CHANGELOG.md
54
+ - Manifest.txt
55
+ - README.md
56
+ files:
57
+ - CHANGELOG.md
58
+ - Manifest.txt
59
+ - README.md
60
+ - Rakefile
61
+ - bin/fbt
62
+ - lib/sportdb/parser.rb
63
+ - lib/sportdb/parser/lang.rb
64
+ - lib/sportdb/parser/linter.rb
65
+ - lib/sportdb/parser/outline_reader.rb
66
+ - lib/sportdb/parser/parser.rb
67
+ - lib/sportdb/parser/token-date.rb
68
+ - lib/sportdb/parser/token-score.rb
69
+ - lib/sportdb/parser/token-text.rb
70
+ - lib/sportdb/parser/token.rb
71
+ homepage: https://github.com/sportdb/sport.db
72
+ licenses:
73
+ - Public Domain
74
+ metadata: {}
75
+ post_install_message:
76
+ rdoc_options:
77
+ - "--main"
78
+ - README.md
79
+ require_paths:
80
+ - lib
81
+ required_ruby_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: 2.2.2
86
+ required_rubygems_version: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: '0'
91
+ requirements: []
92
+ rubygems_version: 3.4.10
93
+ signing_key:
94
+ specification_version: 4
95
+ summary: sportdb-parser - football.txt match parser (& tokenizer)
96
+ test_files: []