sportdb-parser 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,364 @@
1
+
2
+
3
+ module SportDb
4
+ class Parser
5
+
6
+
7
+ ##
8
+ # keep 18h30 - why? why not?
9
+ # add support for 6:30pm 8:20am etc. - why? why not?
10
+
11
## Clock time - e.g. 18.30 or 18:30 or 18h30 (the h may be upper case too).
##   named captures:
##     time   - the complete literal e.g. 18.30
##     hour   - one or two digits
##     minute - exactly two digits
##   note - no range check happens here, that is, 99.99 matches too
TIME_RE = %r{
  (?<time>
     \b
     (?<hour> \d{1,2} )
     [.:h]                 # separator - dot, colon or h
     (?<minute> \d{2} )
     \b
  )
}ix
20
+
21
+
22
+
23
+ ##
24
+ # for timezone format use for now:
25
+ # (BRT/UTC-3) (e.g. brazil time)
26
+ #
27
+ # (CET/UTC+1) - central european time
28
+ # (CEST/UTC+2) - central european summer time - daylight saving time (DST).
29
+ # (EET/UTC+2) - eastern european time
30
+ # (EEST/UTC+3) - eastern european summer time - daylight saving time (DST).
31
+ #
32
+ # UTC+3
33
+ # UTC+4
34
+ # UTC+0
35
+ # UTC+00
36
+ # UTC+0000
37
+ #
38
+ # - allow +01 or +0100 - why? why not
39
+ # - +0130 (01:30)
40
+ #
41
+ # see
42
+ # https://en.wikipedia.org/wiki/Time_zone
43
+ # https://en.wikipedia.org/wiki/List_of_UTC_offsets
44
+ # https://en.wikipedia.org/wiki/UTC−04:00 etc.
45
+
46
## Timezone in parentheses - e.g. (UTC-2) or (CEST/UTC-2) etc.
##   named capture:
##     timezone - the complete literal incl. the enclosing parentheses
TIMEZONE_RE = %r{
  (?<timezone>
     \(
        (?: [a-z]+ / )?    # optional "local" timezone name plus slash e.g. BRT/ or CEST/
        [a-z]+             # timezone name e.g. UTC
        [+-]               # sign is required
        \d{1,4}            # offset - one to four digits e.g. 0 or 00 or 0000
     \)
  )
}ix
60
+
61
+
62
+
63
+
64
## Basic tokens - alternatives are tried top to bottom:
##   num    - number in parentheses e.g. (51) or (1); digits only in value
##   vs     - vs, vs., v, v. or a dash; a space is required before and after
##   none   - a dash after a space, an opening bracket or at the start of a line
##            that is followed by optional spaces and a semicolon
##   spaces - run of two or more spaces
##   space  - single space
##   sym    - one of ; , @ | [ ]
BASICS_RE = %r{
    (?<num> \( (?<value> \d+ ) \) )
  |
    (?<vs>
       (?<=[ ])              # space required before
       (?: vs?\.? | - )      # tries vs. then vs then v. then v - and last the dash
       (?=[ ])               # space required after
    )
  |
    (?<none>
       (?<=[ \[]|^)          # after space, opening bracket or at start of line
       -
       (?=[ ]*;)             # optional spaces plus semicolon must follow
    )
  |
    (?<spaces> [ ]{2,} )
  |
    (?<space> [ ] )
  |
    (?<sym> [;,@|\[\]] )
}ix
89
+
90
+
91
## Minute with marker - e.g. 45' or 90+3'
##   named captures:
##     minute - the complete literal incl. the marker
##     value  - one to three digits (0 to 999)
##     value2 - optional one to three digits after the plus sign
MINUTE_RE = %r{
  (?<minute>
     (?<=[ ])                          # space right before the number is required
     (?<value> \d{1,3} )
     (?: \+ (?<value2> \d{1,3} ) )?    # optional +offset
     '                                 # must have minute marker
  )
}ix
101
+
102
+
103
+ ## (match) status
104
+ ## note: english usage - cancelled (in UK), canceled (in US)
105
+ ##
106
+ ## add more variants - why? why not?
107
+
108
## (Match) status keyword - e.g. cancelled, postponed, awd. etc.
##   named capture:
##     status - the keyword as written (incl. trailing dot for the short forms)
##   the keyword must start at a word boundary and must be followed by
##   a space, a closing bracket or the end of the line
STATUS_RE = %r{
  (?<status>
     \b
     (?: cancell?ed | can\.      # cancelled (UK), canceled (US)
       | abandoned  | abd\.
       | postponed
       | awarded    | awd\.
       | replay
     )
     (?=[ \]]|$)
  )
}ix
124
+
125
+ ## todo/check: remove lookahead assertion here - why require space?
126
+ ## note: \b only matches between a word and a non-word character
127
+ ## thus, no \b after the dot in awd. - a "custom" lookahead is needed instead
128
+
129
+
130
+ ## goal types
131
+ # (pen.) or (pen) or (p.) or (p)
132
+ ## (o.g.) or (og)
133
## Penalty goal marker - (pen.) or (pen) or (p.) or (p)
GOAL_PEN_RE = %r{
  (?<pen> \( p(?:en)? \.? \) )
}ix
139
## Own goal marker - (o.g.) or (og)
GOAL_OG_RE = %r{
  (?<og> \( (?: o\.g\. | og ) \) )
}ix
145
+
146
+
147
+
148
+
149
## Combined tokenizer regex - a union (alternation) of all token regexes
##   in the order listed below.
RE = Regexp.union( [
       STATUS_RE,
       TIMEZONE_RE,
       TIME_RE,
       DURATION_RE,    # note - duration MUST match before date
       DATE_RE,
       SCORE_RE,
       BASICS_RE,
       MINUTE_RE,
       GOAL_OG_RE,
       GOAL_PEN_RE,
       TEXT_RE,
     ] )
158
+
159
+
160
## Append msg plus a trailing newline to a (utf-8) log file.
##
##   msg  - the text to append
##   path - the file to append to; defaults to ./logs.txt
##            (that is, relative to the current working directory)
##
##   use ./errors.txt - why? why not?
def log( msg, path: './logs.txt' )
  ## note - mode a (append) creates the file if missing
  File.open( path, 'a:utf-8' ) do |f|
    f.write( msg )
    f.write( "\n" )
  end
end
168
+
169
+
170
+
171
##
## Split a line into tokens using the combined RE regex.
##
##   line  - the text line to tokenize
##   typed - if true, tokens carry converted values (integers, hashes)
##             in addition to (or instead of) the matched string
##   debug - if true, print the match data and the scan position for every step
##
## Returns a two-element array [tokens, errors]
##   tokens - array of token arrays e.g. [:text, "..."], [:num, "(1)"], [:','] etc.
##   errors - array of "parse error - skipping ..." strings;
##              one for every text run that did not match
##
## note - every skipped text run gets printed to stdout and appended via log too
## note - raises ArgumentError if typed is true and a time is out-of-range
def tokenize_with_errors( line, typed: false,
                                debug: false )
  tokens = []
  errors = []    ## keep a list of errors - why? why not?

  puts ">#{line}<"  if debug

  pos      = 0
  last_end = 0   ## end offset of the last match; stays 0 if nothing matched at all

  while (m = RE.match( line, pos ))
    if debug
      pp m
      puts "pos: #{pos}"
    end

    start, last_end = m.begin(0), m.end(0)

    ## match NOT starting at the current scan position,
    ##   that is, the text in-between got skipped - report parse error!
    ##
    ## NOTE(review): the reported offsets are the ones of the match found
    ##   (not of the skipped text run) - kept as is; check if intended
    if start != pos
      _report_skipped( line, line[pos...start], start, last_end, errors )
    end

    pos = last_end

    pp [start, last_end]   if debug

    token = _token_for_match( m, typed )
    tokens << token   if token

    if debug
      print ">"
      print "*" * pos
      puts "#{line[pos..-1]}<"
    end
  end

  ## check for unmatched text at the end of the line
  ##   (or in the complete line if there was no match at all)
  if last_end != line.size
    _report_skipped( line, line[last_end..-1], last_end, line.size, errors )
  end

  [tokens, errors]
end

## helper - print, record and log a skipped (unmatched) text run
def _report_skipped( line, skipped, from, to, errors )
  err = "parse error - skipping >#{skipped}< @#{from},#{to}"
  msg = "!! WARN - #{err} in line >#{line}<"
  puts msg
  errors << err
  log( msg )
end
private :_report_skipped

## helper - convert a match into a token array;
##   returns nil for matches that produce no token (spaces, brackets, unknown)
##
## note - the order of the checks matters
##   e.g. the time regex has a minute capture too, thus time gets checked before minute
def _token_for_match( m, typed )
  if m[:space] || m[:spaces]
    nil    ## skip space(s)
  elsif m[:text]
    [:text, m[:text]]   ## keep pos - why? why not?
  elsif m[:status]      ## (match) status e.g. cancelled, awarded, etc.
    [:status, m[:status]]
  elsif m[:time]
    typed ? [:time, m[:time], _typed_time( m )] : [:time, m[:time]]
  elsif m[:date]
    typed ? [:date, m[:date], _typed_date( m )] : [:date, m[:date]]
  elsif m[:timezone]
    [:timezone, m[:timezone]]
  elsif m[:duration]
    [:duration, m[:duration]]
  elsif m[:num]
    ## note - typed strips the enclosing () and converts to integer
    typed ? [:num, m[:value].to_i(10)] : [:num, m[:num]]
  elsif m[:score]
    typed ? [:score, m[:score], _typed_score( m )] : [:score, m[:score]]
  elsif m[:minute]
    typed ? [:minute, m[:minute], _typed_minute( m )] : [:minute, m[:minute]]
  elsif m[:og]
    typed ? [:og] : [:og, m[:og]]   ## for typed drop - string version/variants
  elsif m[:pen]
    typed ? [:pen] : [:pen, m[:pen]]
  elsif m[:vs]
    typed ? [:vs] : [:vs, m[:vs]]
  elsif m[:none]
    typed ? [:none] : [:none, m[:none]]
  elsif m[:sym]
    ## return symbols "inline" as is - why? why not?
    case m[:sym]
    when ',' then [:',']
    when ';' then [:';']
    when '@' then [:'@']
    when '|' then [:'|']
    else nil    ## ignore others (e.g. brackets [])
    end
  else
    nil   ## todo - report error
  end
end
private :_token_for_match

## helper - convert hour and minute captures to a hash e.g. 12h40 => {h:12,m:40}
##   accepts hours 0 to 24 and minutes 0 to 59 - raises ArgumentError otherwise
##   (no time-only type in ruby, thus a hash)
def _typed_time( m )
  hour   = m[:hour].to_i(10)    ## allow 08/07/etc.
  minute = m[:minute].to_i(10)

  unless (0..24).cover?( hour ) && (0..59).cover?( minute )
    raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
  end

  {h:hour,m:minute}
end
private :_typed_time

## helper - build a hash with the keys y, m, d, wday
##   from the captures year, month_name, day, day_name - missing captures get skipped
##   note - names get downcased for the MONTH_MAP / DAY_MAP lookup (allows JULY/JUL etc.)
def _typed_date( m )
  date = {}
  date[:y]    = m[:year].to_i(10)                   if m[:year]
  date[:m]    = MONTH_MAP[ m[:month_name].downcase ] if m[:month_name]
  date[:d]    = m[:day].to_i(10)                    if m[:day]
  date[:wday] = DAY_MAP[ m[:day_name].downcase ]     if m[:day_name]
  date
end
private :_typed_date

## helper - build a hash with the keys p, et, ft, ht
##   each a pair of integers; a key is only set if both captures are present
def _typed_score( m )
  score = {}
  [[:p,  :p1,  :p2],
   [:et, :et1, :et2],
   [:ft, :ft1, :ft2],
   [:ht, :ht1, :ht2]].each do |key, name1, name2|
    score[key] = [m[name1].to_i(10), m[name2].to_i(10)]  if m[name1] && m[name2]
  end
  score
end
private :_typed_score

## helper - build a hash with the key m and (if value2 is present) the key offset
def _typed_minute( m )
  minute = {}
  minute[:m]      = m[:value].to_i(10)
  minute[:offset] = m[:value2].to_i(10)   if m[:value2]
  minute
end
private :_typed_minute
351
+
352
+
353
### convenience helper - ignore errors by default
##   same as tokenize_with_errors but returns the tokens only
def tokenize( line, typed: false,
                    debug: false )
  tokenize_with_errors( line, typed: typed, debug: debug ).first
end
360
+
361
+
362
+ end # class Parser
363
+ end # module SportDb
364
+
@@ -0,0 +1,44 @@
1
+
2
+
3
+ ####
4
+ # try a (simple) tokenizer/parser with regex
5
+
6
+ ## note - match line-by-line
7
+ # avoid massive backtracking by definition
8
+ # that is, making it impossible
9
+
10
+ ## sym(bols) -
11
+ ## text - change text to name - why? why not?
12
+
13
+
14
+
15
+ require_relative 'parser/token-score'
16
+ require_relative 'parser/token-date'
17
+ require_relative 'parser/token-text'
18
+ require_relative 'parser/token'
19
+ require_relative 'parser/lang'
20
+ require_relative 'parser/parser'
21
+
22
+
23
+ ## more
24
+ require_relative 'parser/outline_reader'
25
+ require_relative 'parser/linter'
26
+
27
+
28
+ ###
29
+ # make parser api (easily) available - why? why not?
30
+
31
+ =begin
32
+ module SportDb
33
+ def self.parser() @@parser ||= Parser.new; end
34
+ def self.parse( ... )
35
+ end
36
+ def self.tokenize( ... )
37
+ end
38
+ end # module SportDb
39
+ =end
40
+
41
+
42
+
43
+
44
+
metadata ADDED
@@ -0,0 +1,96 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sportdb-parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Gerald Bauer
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2024-07-14 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rdoc
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '4.0'
20
+ - - "<"
21
+ - !ruby/object:Gem::Version
22
+ version: '7'
23
+ type: :development
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ version: '4.0'
30
+ - - "<"
31
+ - !ruby/object:Gem::Version
32
+ version: '7'
33
+ - !ruby/object:Gem::Dependency
34
+ name: hoe
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '4.1'
40
+ type: :development
41
+ prerelease: false
42
+ version_requirements: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '4.1'
47
+ description: sportdb-parser - football.txt match parser (& tokenizer)
48
+ email: gerald.bauer@gmail.com
49
+ executables:
50
+ - fbt
51
+ extensions: []
52
+ extra_rdoc_files:
53
+ - CHANGELOG.md
54
+ - Manifest.txt
55
+ - README.md
56
+ files:
57
+ - CHANGELOG.md
58
+ - Manifest.txt
59
+ - README.md
60
+ - Rakefile
61
+ - bin/fbt
62
+ - lib/sportdb/parser.rb
63
+ - lib/sportdb/parser/lang.rb
64
+ - lib/sportdb/parser/linter.rb
65
+ - lib/sportdb/parser/outline_reader.rb
66
+ - lib/sportdb/parser/parser.rb
67
+ - lib/sportdb/parser/token-date.rb
68
+ - lib/sportdb/parser/token-score.rb
69
+ - lib/sportdb/parser/token-text.rb
70
+ - lib/sportdb/parser/token.rb
71
+ homepage: https://github.com/sportdb/sport.db
72
+ licenses:
73
+ - Public Domain
74
+ metadata: {}
75
+ post_install_message:
76
+ rdoc_options:
77
+ - "--main"
78
+ - README.md
79
+ require_paths:
80
+ - lib
81
+ required_ruby_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: 2.2.2
86
+ required_rubygems_version: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: '0'
91
+ requirements: []
92
+ rubygems_version: 3.4.10
93
+ signing_key:
94
+ specification_version: 4
95
+ summary: sportdb-parser - football.txt match parser (& tokenizer)
96
+ test_files: []