sportdb-parser 0.1.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Rakefile +2 -2
- data/bin/fbt +6 -7
- data/lib/sportdb/parser/lang.rb +68 -19
- data/lib/sportdb/parser/outline_reader.rb +1 -5
- data/lib/sportdb/parser/parser.rb +45 -27
- data/lib/sportdb/parser/token-date.rb +99 -21
- data/lib/sportdb/parser/token.rb +64 -51
- data/lib/sportdb/parser/version.rb +2 -2
- data/lib/sportdb/parser.rb +3 -3
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3657cedc5125ee2515efa8be4a1838d05b7290523dd893f7eba5b87024e71238
|
4
|
+
data.tar.gz: caf6d7e909e17fa0dcabf659ab8f5046ca1940d8f7c1c6f5312e485dc0089384
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4063565aada304a1eb96009b6fe542392f41a55d4ad4d21b5de156004bd69a055c5f86b076bed1defbe50423c8c891dd538931ea6ca9b8ec41e237c23e699219
|
7
|
+
data.tar.gz: 91f6476810cb6617dfcd703ada57592cd38b87f3b4b9fc6fd4468a9457ff0e6ae6337a4e4f5c782e1b80f5f6b6015d5ce26ed6330915cd67a5fb6606f665017f
|
data/CHANGELOG.md
CHANGED
data/Rakefile
CHANGED
@@ -21,11 +21,11 @@ Hoe.spec 'sportdb-parser' do
|
|
21
21
|
self.licenses = ['Public Domain']
|
22
22
|
|
23
23
|
self.extra_deps = [
|
24
|
-
['cocos'],
|
24
|
+
['cocos', '>= 0.4.0'],
|
25
25
|
['season-formats'],
|
26
26
|
]
|
27
27
|
|
28
28
|
self.spec_extras = {
|
29
|
-
required_ruby_version: '>=
|
29
|
+
required_ruby_version: '>= 3.1.0'
|
30
30
|
}
|
31
31
|
end
|
data/bin/fbt
CHANGED
@@ -11,7 +11,7 @@ require 'sportdb/parser'
|
|
11
11
|
require 'optparse'
|
12
12
|
|
13
13
|
##
|
14
|
-
## read textfile
|
14
|
+
## read textfile
|
15
15
|
## and dump tokens
|
16
16
|
##
|
17
17
|
## fbt ../openfootball/.../euro.txt
|
@@ -32,7 +32,7 @@ require 'optparse'
|
|
32
32
|
|
33
33
|
|
34
34
|
parser.on( "--verbose", "--debug",
|
35
|
-
"turn on verbose / debug output (default: #{opts[:debug]}
|
35
|
+
"turn on verbose / debug output (default: #{opts[:debug]})" ) do |debug|
|
36
36
|
opts[:debug] = debug
|
37
37
|
end
|
38
38
|
|
@@ -53,18 +53,15 @@ p args
|
|
53
53
|
|
54
54
|
|
55
55
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
56
|
paths = if args.empty?
|
60
57
|
[
|
61
|
-
'../../../openfootball/euro/
|
58
|
+
'../../../openfootball/euro/2021--europe/euro.txt',
|
62
59
|
'../../../openfootball/euro/2024--germany/euro.txt',
|
63
60
|
]
|
64
61
|
else
|
65
62
|
## check for directories
|
66
63
|
## and auto-expand
|
67
|
-
|
64
|
+
|
68
65
|
SportDb::Parser::Opts.expand_args( args )
|
69
66
|
end
|
70
67
|
|
@@ -86,8 +83,10 @@ end
|
|
86
83
|
if errors.size > 0
|
87
84
|
puts
|
88
85
|
pp errors
|
86
|
+
puts
|
89
87
|
puts "!! #{errors.size} parse error(s) in #{paths.size} datafiles(s)"
|
90
88
|
else
|
89
|
+
puts
|
91
90
|
puts "OK no parse errors found in #{paths.size} datafile(s)"
|
92
91
|
end
|
93
92
|
|
data/lib/sportdb/parser/lang.rb
CHANGED
@@ -15,7 +15,7 @@ class Parser
|
|
15
15
|
|
16
16
|
GROUP_RE = %r{^
|
17
17
|
Group [ ]
|
18
|
-
(?<key>[a-z0-9]+)
|
18
|
+
(?<key>[a-z0-9]+)
|
19
19
|
$}ix
|
20
20
|
def is_group?( text )
|
21
21
|
## use regex for match
|
@@ -27,43 +27,81 @@ end
|
|
27
27
|
|
28
28
|
ROUND_RE = %r{^(
|
29
29
|
|
30
|
+
## add special case for group play-off rounds!
|
31
|
+
## group 2 play-off (e.g. worldcup 1954, 1958)
|
32
|
+
(?: Group [ ] [a-z0-9]+ [ ]
|
33
|
+
Play-?offs?
|
34
|
+
)
|
35
|
+
|
|
30
36
|
# round - note - requires number e.g. round 1,2, etc.
|
37
|
+
# note - use 1-9 regex (cannot start with 0) - why? why not?
|
38
|
+
# make week 01 or round 01 or matchday 01 possible?
|
31
39
|
(?: (?: Round |
|
32
40
|
Matchday |
|
33
41
|
Week
|
34
42
|
)
|
35
|
-
[ ] [0-9]
|
43
|
+
[ ] [1-9][0-9]*
|
44
|
+
)
|
45
|
+
|
|
46
|
+
## starting with qual(ification)
|
47
|
+
## Qual. Round 1 / Qual. Round 2 / Qual. Round 3
|
48
|
+
(?: Qual \. [ ]
|
49
|
+
Round
|
50
|
+
[ ] [1-9][0-9]*
|
36
51
|
)
|
37
52
|
|
|
53
|
+
## 1. Round / 2. Round / 3. Round / etc.
|
54
|
+
## Play-off Round
|
55
|
+
## First Round
|
56
|
+
## Final Round (e.g. Worldcup 1950)
|
57
|
+
(?:
|
58
|
+
(?: [1-9][0-9]* \. |
|
59
|
+
Play-?off |
|
60
|
+
1st | First |
|
61
|
+
2nd | Second |
|
62
|
+
Final
|
63
|
+
)
|
64
|
+
[ ] Round
|
65
|
+
)
|
66
|
+
|
|
67
|
+
## starting with preliminary
|
68
|
+
# e.g. Preliminary round
|
69
|
+
(?: Preliminary [ ]
|
70
|
+
(?: Round |
|
71
|
+
Semi-?finals |
|
72
|
+
Final
|
73
|
+
)
|
74
|
+
)
|
75
|
+
|
|
38
76
|
# more (knockout) rounds
|
39
77
|
# playoffs - playoff, play-off, play-offs
|
40
|
-
(?: Play-?offs?
|
78
|
+
(?: Play-?offs?
|
41
79
|
(?: [ ]for[ ]quarter-?finals )?
|
42
80
|
)
|
43
|
-
|
|
81
|
+
|
|
44
82
|
# round32
|
45
|
-
(?: Round[ ]of[ ]32 |
|
83
|
+
(?: Round[ ]of[ ]32 |
|
46
84
|
Last[ ]32 )
|
47
85
|
|
|
48
|
-
# round16
|
86
|
+
# round16
|
49
87
|
(?: Round[ ]of[ ]16 |
|
50
|
-
Last[ ]16 |
|
88
|
+
Last[ ]16 |
|
51
89
|
8th[ ]finals )
|
52
90
|
|
|
53
91
|
# fifthplace
|
54
92
|
(?:
|
55
|
-
(?: (Fifth|5th)[ -]place
|
93
|
+
(?: (Fifth|5th)[ -]place
|
56
94
|
(?: [ ] (?: match|play-?off|final ))?
|
57
95
|
) |
|
58
96
|
(?: Match[ ]for[ ](?: fifth|5th )[ -]place )
|
59
97
|
)
|
60
98
|
|
|
61
99
|
# thirdplace
|
62
|
-
(?:
|
63
|
-
(?: (Third|3rd)[ -]place
|
64
|
-
(?: [ ] (?: match|play-?off|final ))?
|
100
|
+
(?:
|
101
|
+
(?: (Third|3rd)[ -]place
|
102
|
+
(?: [ ] (?: match|play-?off|final ))?
|
65
103
|
) |
|
66
|
-
(?: Match[ ]for[ ](?: third|3rd )[ -]place )
|
104
|
+
(?: Match[ ]for[ ](?: third|3rd )[ -]place )
|
67
105
|
)
|
68
106
|
|
|
69
107
|
# quarterfinals
|
@@ -72,18 +110,29 @@ ROUND_RE = %r{^(
|
|
72
110
|
Quarters |
|
73
111
|
Last[ ]8
|
74
112
|
)
|
75
|
-
|
|
113
|
+
|
|
76
114
|
# semifinals
|
77
|
-
(?:
|
115
|
+
(?:
|
78
116
|
Semi-?finals? |
|
79
117
|
Semis |
|
80
118
|
Last[ ]4
|
81
119
|
)
|
82
120
|
|
|
83
121
|
# final
|
84
|
-
Finals?
|
85
|
-
|
86
|
-
|
122
|
+
Finals?
|
123
|
+
|
|
124
|
+
## add replays
|
125
|
+
## e.g. Final Replay
|
126
|
+
## Quarter-finals replays
|
127
|
+
## First round replays
|
128
|
+
(?:
|
129
|
+
(?: First [ ] Round |
|
130
|
+
Quarter-?finals? |
|
131
|
+
Finals?
|
132
|
+
)
|
133
|
+
[ ] Replays?
|
134
|
+
)
|
135
|
+
)$}ix
|
87
136
|
|
88
137
|
|
89
138
|
def is_round?( text )
|
@@ -95,9 +144,9 @@ end
|
|
95
144
|
##
|
96
145
|
LEG_RE = %r{^
|
97
146
|
# leg1
|
98
|
-
(?: 1st|First)[ ]leg
|
147
|
+
(?: 1st|First)[ ]leg
|
99
148
|
|
|
100
|
-
# leg2
|
149
|
+
# leg2
|
101
150
|
(?: 2nd|Second)[ ]leg
|
102
151
|
$}ix
|
103
152
|
|
@@ -1,8 +1,4 @@
|
|
1
1
|
|
2
|
-
###
|
3
|
-
## todo/fix - move to sportdb-parser - why? why not? !!!!!!
|
4
|
-
##
|
5
|
-
|
6
2
|
|
7
3
|
module SportDb
|
8
4
|
|
@@ -10,7 +6,7 @@ class OutlineReader
|
|
10
6
|
|
11
7
|
def self.debug=(value) @@debug = value; end
|
12
8
|
def self.debug?() @@debug ||= false; end
|
13
|
-
def debug?() self.class.debug?; end
|
9
|
+
def debug?() self.class.debug?; end
|
14
10
|
|
15
11
|
|
16
12
|
|
@@ -1,24 +1,24 @@
|
|
1
|
-
module SportDb
|
1
|
+
module SportDb
|
2
2
|
class Parser
|
3
|
-
|
3
|
+
|
4
4
|
|
5
5
|
## transforms
|
6
6
|
##
|
7
7
|
## Netherlands 1-2 (1-1) England
|
8
|
-
## => text => team
|
9
|
-
## score|vs
|
8
|
+
## => text => team
|
9
|
+
## score|vs
|
10
10
|
## text => team
|
11
11
|
|
12
12
|
|
13
13
|
## token iter/find better name
|
14
14
|
## e.g. TokenBuffer/Scanner or such ??
|
15
|
-
class Tokens
|
15
|
+
class Tokens
|
16
16
|
def initialize( tokens )
|
17
17
|
@tokens = tokens
|
18
18
|
@pos = 0
|
19
19
|
end
|
20
20
|
|
21
|
-
def pos() @pos; end
|
21
|
+
def pos() @pos; end
|
22
22
|
def eos?() @pos >= @tokens.size; end
|
23
23
|
|
24
24
|
|
@@ -47,17 +47,17 @@ class Tokens
|
|
47
47
|
## return token type (e.g. :text, :num, etc.)
|
48
48
|
def cur() peek(0); end
|
49
49
|
## return content (assumed to be text)
|
50
|
-
def text(offset=0)
|
50
|
+
def text(offset=0)
|
51
51
|
## raise error - why? why not?
|
52
52
|
## return nil?
|
53
53
|
if peek( offset ) != :text
|
54
54
|
raise ArgumentError, "text(#{offset}) - token not a text type"
|
55
55
|
end
|
56
|
-
@tokens[@pos+offset][1]
|
56
|
+
@tokens[@pos+offset][1]
|
57
57
|
end
|
58
58
|
|
59
59
|
|
60
|
-
def peek(offset=1)
|
60
|
+
def peek(offset=1)
|
61
61
|
## return nil if eos
|
62
62
|
if @pos+offset >= @tokens.size
|
63
63
|
nil
|
@@ -66,7 +66,7 @@ class Tokens
|
|
66
66
|
end
|
67
67
|
end
|
68
68
|
|
69
|
-
## note - returns complete token
|
69
|
+
## note - returns complete token
|
70
70
|
def next
|
71
71
|
# if @pos >= @tokens.size
|
72
72
|
# raise ArgumentError, "end of array - #{@pos} >= #{@tokens.size}"
|
@@ -81,7 +81,7 @@ class Tokens
|
|
81
81
|
def collect( &blk )
|
82
82
|
tokens = []
|
83
83
|
loop do
|
84
|
-
break if eos?
|
84
|
+
break if eos?
|
85
85
|
tokens << if block_given?
|
86
86
|
blk.call( self.next )
|
87
87
|
else
|
@@ -106,7 +106,7 @@ def parse_with_errors( line, debug: false )
|
|
106
106
|
errors += token_errors
|
107
107
|
|
108
108
|
#############
|
109
|
-
## pass 1
|
109
|
+
## pass 1
|
110
110
|
## replace all texts with keyword matches (e.g. group, round, leg, etc.)
|
111
111
|
tokens = tokens.map do |t|
|
112
112
|
if t[0] == :text
|
@@ -129,24 +129,40 @@ def parse_with_errors( line, debug: false )
|
|
129
129
|
## puts "tokens:"
|
130
130
|
## pp tokens
|
131
131
|
|
132
|
-
## transform tokens into (parse tree/ast) nodes
|
132
|
+
## transform tokens into (parse tree/ast) nodes
|
133
133
|
nodes = []
|
134
|
-
|
134
|
+
|
135
135
|
buf = Tokens.new( tokens )
|
136
136
|
## pp buf
|
137
137
|
|
138
138
|
|
139
|
-
loop do
|
140
|
-
if buf.
|
141
|
-
|
142
|
-
|
143
|
-
|
139
|
+
loop do
|
140
|
+
break if buf.eos?
|
141
|
+
|
142
|
+
## simplify - remove separator for round + leg pair
|
143
|
+
## e.g. Round of 16, 1st Leg
|
144
|
+
## allow Round of 16 - 1st Leg too - why? why not?
|
145
|
+
if buf.match?( :round, [:',', :'|',
|
146
|
+
:'-',
|
147
|
+
:vs, ### fix - change parser to issue :'-' only for (-) not :vs!!!
|
148
|
+
], :leg )
|
149
|
+
nodes << [:round, buf.next[1]]
|
150
|
+
buf.next ## swallow separator
|
151
|
+
nodes << [:leg, buf.next[1]]
|
152
|
+
next
|
153
|
+
end
|
154
|
+
|
155
|
+
|
156
|
+
if buf.pos == 0 ## MUST start line
|
157
|
+
## check for
|
158
|
+
## group def or round def
|
159
|
+
if buf.match?( :round, :'|', [:date, :duration] ) ## assume round def (change round to round_def)
|
144
160
|
nodes << [:round_def, buf.next[1]]
|
145
161
|
buf.next ## swallow pipe
|
146
162
|
nodes += buf.collect
|
147
163
|
break
|
148
164
|
end
|
149
|
-
if buf.match?( :group, :'|' ) ## assume group def (change group to group_def)
|
165
|
+
if buf.match?( :group, :'|', :text ) ## assume group def (change group to group_def)
|
150
166
|
nodes << [:group_def, buf.next[1]]
|
151
167
|
buf.next ## swallow pipe
|
152
168
|
## change all text to team
|
@@ -154,11 +170,15 @@ def parse_with_errors( line, debug: false )
|
|
154
170
|
t[0] == :text ? [:team, t[1]] : t
|
155
171
|
}
|
156
172
|
break
|
157
|
-
end
|
173
|
+
end
|
158
174
|
end
|
159
175
|
|
160
176
|
|
161
|
-
if buf.match?( :text,
|
177
|
+
if buf.match?( :text, :'-', :text ) ## hacky? convert "generic" :- to :vs
|
178
|
+
nodes << [:team, buf.next[1]] ## keep this rule/option - why? why not?
|
179
|
+
nodes << [:vs]
|
180
|
+
nodes << [:team, buf.next[1]]
|
181
|
+
elsif buf.match?( :text, [:score, :vs], :text )
|
162
182
|
nodes << [:team, buf.next[1]]
|
163
183
|
nodes << buf.next
|
164
184
|
nodes << [:team, buf.next[1]]
|
@@ -170,14 +190,12 @@ def parse_with_errors( line, debug: false )
|
|
170
190
|
## only change text to geo
|
171
191
|
nodes += buf.collect { |t|
|
172
192
|
t[0] == :text ? [:geo, t[1]] : t
|
173
|
-
}
|
193
|
+
}
|
174
194
|
break
|
175
195
|
else
|
176
196
|
## pass through
|
177
197
|
nodes << buf.next
|
178
198
|
end
|
179
|
-
|
180
|
-
break if buf.eos?
|
181
199
|
end
|
182
200
|
|
183
201
|
[nodes,errors]
|
@@ -192,5 +210,5 @@ end
|
|
192
210
|
|
193
211
|
|
194
212
|
end # class Parser
|
195
|
-
end # module SportDb
|
196
|
-
|
213
|
+
end # module SportDb
|
214
|
+
|
@@ -1,6 +1,6 @@
|
|
1
|
-
module SportDb
|
1
|
+
module SportDb
|
2
2
|
class Parser
|
3
|
-
|
3
|
+
|
4
4
|
|
5
5
|
|
6
6
|
def self.parse_names( txt )
|
@@ -47,8 +47,8 @@ def self.build_map( lines, downcase: false )
|
|
47
47
|
## "may" => 5,
|
48
48
|
## "june" => 6, "jun" => 6, ...
|
49
49
|
lines.each_with_index.reduce( {} ) do |h,(line,i)|
|
50
|
-
line.each do |name|
|
51
|
-
h[ downcase ? name.downcase : name ] = i+1
|
50
|
+
line.each do |name|
|
51
|
+
h[ downcase ? name.downcase : name ] = i+1
|
52
52
|
end ## note: start mapping with 1 (and NOT zero-based, that is, 0)
|
53
53
|
h
|
54
54
|
end
|
@@ -109,28 +109,56 @@ DAY_MAP = build_map( DAY_LINES, downcase: true )
|
|
109
109
|
## todo - add more date variants !!!!
|
110
110
|
|
111
111
|
# e.g. Fri Aug/9 or Fri Aug 9
|
112
|
-
|
112
|
+
DATE_I_RE = %r{
|
113
113
|
(?<date>
|
114
114
|
\b
|
115
115
|
## optional day name
|
116
116
|
((?<day_name>#{DAY_NAMES})
|
117
117
|
[ ]
|
118
|
-
)?
|
118
|
+
)?
|
119
119
|
(?<month_name>#{MONTH_NAMES})
|
120
120
|
(?: \/|[ ] )
|
121
121
|
(?<day>\d{1,2})
|
122
122
|
## optional year
|
123
123
|
( [ ]
|
124
124
|
(?<year>\d{4})
|
125
|
-
)?
|
126
|
-
\b
|
125
|
+
)?
|
126
|
+
\b
|
127
|
+
)}ix
|
128
|
+
|
129
|
+
|
130
|
+
# e.g. 3 June or 10 June
|
131
|
+
DATE_II_RE = %r{
|
132
|
+
(?<date>
|
133
|
+
\b
|
134
|
+
## optional day name
|
135
|
+
((?<day_name>#{DAY_NAMES})
|
136
|
+
[ ]
|
137
|
+
)?
|
138
|
+
(?<day>\d{1,2})
|
139
|
+
[ ]
|
140
|
+
(?<month_name>#{MONTH_NAMES})
|
141
|
+
## optional year
|
142
|
+
( [ ]
|
143
|
+
(?<year>\d{4})
|
144
|
+
)?
|
145
|
+
\b
|
127
146
|
)}ix
|
128
147
|
|
129
148
|
|
149
|
+
#############################################
|
150
|
+
# map tables
|
151
|
+
# note: order matters; first come-first matched/served
|
152
|
+
DATE_RE = Regexp.union(
|
153
|
+
DATE_I_RE,
|
154
|
+
DATE_II_RE
|
155
|
+
)
|
156
|
+
|
157
|
+
|
130
158
|
###
|
131
|
-
# date duration
|
159
|
+
# date duration
|
132
160
|
# use - or + as separator
|
133
|
-
# in theory plus( +) only if dates
|
161
|
+
# in theory plus( +) only if dates
|
134
162
|
# are two days next to each other
|
135
163
|
#
|
136
164
|
# otherwise define new dates type in the future? why? why not?
|
@@ -147,7 +175,7 @@ DATE_RE = %r{
|
|
147
175
|
# Jun/25 .. 26 - why? why not???
|
148
176
|
# Jun/25 to 26 - why? why not???
|
149
177
|
# Jun/25 + 26 - add - why? why not???
|
150
|
-
# Sun-Wed Jun/23-26 - add - why? why not???
|
178
|
+
# Sun-Wed Jun/23-26 - add - why? why not???
|
151
179
|
# Wed+Thu Jun/26+27 2024 - add - why? why not???
|
152
180
|
#
|
153
181
|
# maybe use comma and plus for list of dates
|
@@ -157,39 +185,89 @@ DATE_RE = %r{
|
|
157
185
|
# add back optional comma (before) year - why? why not?
|
158
186
|
|
159
187
|
|
160
|
-
|
188
|
+
##
|
189
|
+
# todo add plus later on - why? why not?
|
190
|
+
|
191
|
+
DURATION_I_RE = %r{
|
161
192
|
(?<duration>
|
162
193
|
\b
|
163
194
|
## optional day name
|
164
195
|
((?<day_name1>#{DAY_NAMES})
|
165
196
|
[ ]
|
166
|
-
)?
|
197
|
+
)?
|
167
198
|
(?<month_name1>#{MONTH_NAMES})
|
168
199
|
(?: \/|[ ] )
|
169
200
|
(?<day1>\d{1,2})
|
170
201
|
## optional year
|
171
202
|
( [ ]
|
172
203
|
(?<year1>\d{4})
|
173
|
-
)?
|
204
|
+
)?
|
174
205
|
|
175
206
|
## support + and - (add .. or such - why??)
|
176
|
-
[ ]*[
|
177
|
-
|
207
|
+
[ ]*[-][ ]*
|
208
|
+
|
178
209
|
## optional day name
|
179
210
|
((?<day_name2>#{DAY_NAMES})
|
180
211
|
[ ]
|
181
|
-
)?
|
212
|
+
)?
|
182
213
|
(?<month_name2>#{MONTH_NAMES})
|
183
214
|
(?: \/|[ ] )
|
184
215
|
(?<day2>\d{1,2})
|
185
216
|
## optional year
|
186
217
|
( [ ]
|
187
218
|
(?<year2>\d{4})
|
188
|
-
)?
|
189
|
-
\b
|
219
|
+
)?
|
220
|
+
\b
|
221
|
+
)}ix
|
222
|
+
|
223
|
+
|
224
|
+
###
|
225
|
+
# variant ii
|
226
|
+
# e.g. 26 July - 27 July
|
227
|
+
|
228
|
+
DURATION_II_RE = %r{
|
229
|
+
(?<duration>
|
230
|
+
\b
|
231
|
+
## optional day name
|
232
|
+
((?<day_name1>#{DAY_NAMES})
|
233
|
+
[ ]
|
234
|
+
)?
|
235
|
+
(?<day1>\d{1,2})
|
236
|
+
[ ]
|
237
|
+
(?<month_name1>#{MONTH_NAMES})
|
238
|
+
## optional year
|
239
|
+
( [ ]
|
240
|
+
(?<year1>\d{4})
|
241
|
+
)?
|
242
|
+
|
243
|
+
## support + and - (add .. or such - why??)
|
244
|
+
[ ]*[-][ ]*
|
245
|
+
|
246
|
+
## optional day name
|
247
|
+
((?<day_name2>#{DAY_NAMES})
|
248
|
+
[ ]
|
249
|
+
)?
|
250
|
+
(?<day2>\d{1,2})
|
251
|
+
[ ]
|
252
|
+
(?<month_name2>#{MONTH_NAMES})
|
253
|
+
## optional year
|
254
|
+
( [ ]
|
255
|
+
(?<year2>\d{4})
|
256
|
+
)?
|
257
|
+
\b
|
190
258
|
)}ix
|
191
259
|
|
192
260
|
|
261
|
+
#############################################
|
262
|
+
# map tables
|
263
|
+
# note: order matters; first come-first matched/served
|
264
|
+
DURATION_RE = Regexp.union(
|
265
|
+
DURATION_I_RE,
|
266
|
+
DURATION_II_RE
|
267
|
+
)
|
268
|
+
|
269
|
+
|
270
|
+
|
193
271
|
end # class Parser
|
194
|
-
end # module SportDb
|
195
|
-
|
272
|
+
end # module SportDb
|
273
|
+
|
data/lib/sportdb/parser/token.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
|
2
2
|
|
3
|
-
module SportDb
|
3
|
+
module SportDb
|
4
4
|
class Parser
|
5
5
|
|
6
6
|
|
@@ -15,7 +15,7 @@ TIME_RE = %r{
|
|
15
15
|
(?: :|\.|h )
|
16
16
|
(?<minute>\d{2})
|
17
17
|
\b
|
18
|
-
)
|
18
|
+
)
|
19
19
|
}ix
|
20
20
|
|
21
21
|
|
@@ -28,7 +28,7 @@ TIME_RE = %r{
|
|
28
28
|
# (CEST/UTC+2) - central european summer time - daylight saving time (DST).
|
29
29
|
# (EET/UTC+1) - eastern european time
|
30
30
|
# (EEST/UTC+2) - eastern european summer time - daylight saving time (DST).
|
31
|
-
#
|
31
|
+
#
|
32
32
|
# UTC+3
|
33
33
|
# UTC+4
|
34
34
|
# UTC+0
|
@@ -45,7 +45,7 @@ TIME_RE = %r{
|
|
45
45
|
|
46
46
|
TIMEZONE_RE = %r{
|
47
47
|
## e.g. (UTC-2) or (CEST/UTC-2) etc.
|
48
|
-
(?<timezone>
|
48
|
+
(?<timezone>
|
49
49
|
\(
|
50
50
|
## optional "local" timezone name eg. BRT or CEST etc.
|
51
51
|
(?: [a-z]+
|
@@ -63,28 +63,28 @@ TIMEZONE_RE = %r{
|
|
63
63
|
|
64
64
|
BASICS_RE = %r{
|
65
65
|
## e.g. (51) or (1) etc. - limit digits of number???
|
66
|
-
(?<num> \( (?<value>\d+) \) )
|
66
|
+
(?<num> \( (?<value>\d+) \) )
|
67
67
|
|
|
68
|
-
(?<vs>
|
69
|
-
(?<=[ ]) # Positive lookbehind for space
|
70
|
-
(?:
|
68
|
+
(?<vs>
|
69
|
+
(?<=[ ]) # Positive lookbehind for space
|
70
|
+
(?:
|
71
71
|
vs\.?| ## allow optional dot (eg. vs. v.)
|
72
72
|
v\.?|
|
73
73
|
-
|
74
74
|
) # not bigger match first e.g. vs than v etc.
|
75
75
|
(?=[ ]) # positive lookahead for space
|
76
|
-
)
|
77
|
-
|
|
76
|
+
)
|
77
|
+
|
|
78
78
|
(?<none>
|
79
|
-
(?<=[ \[]|^) # Positive lookbehind for space or [
|
79
|
+
(?<=[ \[]|^) # Positive lookbehind for space or [
|
80
80
|
-
|
81
81
|
(?=[ ]*;) # positive lookahead for space
|
82
82
|
)
|
83
83
|
|
|
84
84
|
(?<spaces> [ ]{2,}) |
|
85
|
-
(?<space> [ ])
|
85
|
+
(?<space> [ ])
|
86
86
|
|
|
87
|
-
(?<sym>[;,@|\[\]])
|
87
|
+
(?<sym>[;,@|\[\]])
|
88
88
|
}ix
|
89
89
|
|
90
90
|
|
@@ -94,13 +94,13 @@ MINUTE_RE = %r{
|
|
94
94
|
(?<value>\d{1,3}) ## constrain numbers to 0 to 999!!!
|
95
95
|
(?: \+
|
96
96
|
(?<value2>\d{1,3})
|
97
|
-
)?
|
97
|
+
)?
|
98
98
|
' ## must have minute marker!!!!
|
99
99
|
)
|
100
100
|
}ix
|
101
101
|
|
102
102
|
|
103
|
-
## (match) status
|
103
|
+
## (match) status
|
104
104
|
## note: english usage - cancelled (in UK), canceled (in US)
|
105
105
|
##
|
106
106
|
## add more variants - why? why not?
|
@@ -115,30 +115,30 @@ STATUS_RE = %r{
|
|
115
115
|
|
|
116
116
|
postponed
|
117
117
|
|
|
118
|
-
awarded|awd\.
|
118
|
+
awarded|awd\.
|
119
119
|
|
|
120
|
-
replay
|
120
|
+
replay
|
121
121
|
)
|
122
122
|
(?=[ \]]|$)
|
123
123
|
)}ix
|
124
124
|
|
125
125
|
## todo/check: remove lookahead assertion here - why require space?
|
126
|
-
## note: \b works only after non-alphanum
|
127
|
-
## to make it work with awd. (dot) "custom" lookahead neeeded
|
126
|
+
## note: \b works only after non-alphanum
|
127
|
+
## to make it work with awd. (dot) "custom" lookahead neeeded
|
128
128
|
|
129
129
|
|
130
130
|
## goal types
|
131
|
-
# (pen.) or (pen) or (p.) or (p)
|
131
|
+
# (pen.) or (pen) or (p.) or (p)
|
132
132
|
## (o.g.) or (og)
|
133
133
|
GOAL_PEN_RE = %r{
|
134
|
-
(?<pen> \(
|
135
|
-
(?:pen|p)\.?
|
134
|
+
(?<pen> \(
|
135
|
+
(?:pen|p)\.?
|
136
136
|
\)
|
137
137
|
)
|
138
138
|
}ix
|
139
139
|
GOAL_OG_RE = %r{
|
140
|
-
(?<og> \(
|
141
|
-
(?:og|o\.g\.)
|
140
|
+
(?<og> \(
|
141
|
+
(?:og|o\.g\.)
|
142
142
|
\)
|
143
143
|
)
|
144
144
|
}ix
|
@@ -158,11 +158,11 @@ RE = Regexp.union( STATUS_RE,
|
|
158
158
|
|
159
159
|
|
160
160
|
def log( msg )
|
161
|
-
## append msg to ./logs.txt
|
161
|
+
## append msg to ./logs.txt
|
162
162
|
## use ./errors.txt - why? why not?
|
163
163
|
File.open( './logs.txt', 'a:utf-8' ) do |f|
|
164
164
|
f.write( msg )
|
165
|
-
f.write( "\n" )
|
165
|
+
f.write( "\n" )
|
166
166
|
end
|
167
167
|
end
|
168
168
|
|
@@ -176,7 +176,7 @@ def tokenize_with_errors( line, typed: false,
|
|
176
176
|
puts ">#{line}<" if debug
|
177
177
|
|
178
178
|
pos = 0
|
179
|
-
## track last offsets - to report error on no match
|
179
|
+
## track last offsets - to report error on no match
|
180
180
|
## or no match in end of string
|
181
181
|
offsets = [0,0]
|
182
182
|
m = nil
|
@@ -184,7 +184,7 @@ def tokenize_with_errors( line, typed: false,
|
|
184
184
|
while m = RE.match( line, pos )
|
185
185
|
if debug
|
186
186
|
pp m
|
187
|
-
puts "pos: #{pos}"
|
187
|
+
puts "pos: #{pos}"
|
188
188
|
end
|
189
189
|
offsets = [m.begin(0), m.end(0)]
|
190
190
|
|
@@ -213,10 +213,10 @@ def tokenize_with_errors( line, typed: false,
|
|
213
213
|
elsif m[:spaces]
|
214
214
|
## skip spaces
|
215
215
|
nil
|
216
|
-
elsif m[:text]
|
216
|
+
elsif m[:text]
|
217
217
|
[:text, m[:text]] ## keep pos - why? why not?
|
218
218
|
elsif m[:status] ## (match) status e.g. cancelled, awarded, etc.
|
219
|
-
[:status, m[:status]]
|
219
|
+
[:status, m[:status]]
|
220
220
|
elsif m[:time]
|
221
221
|
if typed
|
222
222
|
## unify to iso-format
|
@@ -230,7 +230,7 @@ def tokenize_with_errors( line, typed: false,
|
|
230
230
|
if (hour >= 0 && hour <= 24) &&
|
231
231
|
(minute >=0 && minute <= 59)
|
232
232
|
## note - for debugging keep (pass along) "literal" time
|
233
|
-
## might use/add support for am/pm later
|
233
|
+
## might use/add support for am/pm later
|
234
234
|
[:time, m[:time], {h:hour,m:minute}]
|
235
235
|
else
|
236
236
|
raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
|
@@ -241,54 +241,68 @@ def tokenize_with_errors( line, typed: false,
|
|
241
241
|
elsif m[:date]
|
242
242
|
if typed
|
243
243
|
date = {}
|
244
|
-
=begin
|
244
|
+
=begin
|
245
245
|
((?<day_name>#{DAY_NAMES})
|
246
246
|
[ ]
|
247
|
-
)?
|
247
|
+
)?
|
248
248
|
(?<month_name>#{MONTH_NAMES})
|
249
249
|
(?: \/|[ ] )
|
250
250
|
(?<day>\d{1,2})
|
251
251
|
## optional year
|
252
252
|
( [ ]
|
253
253
|
(?<year>\d{4})
|
254
|
-
)?
|
254
|
+
)?
|
255
255
|
=end
|
256
256
|
## map month names
|
257
257
|
## note - allow any/upcase JULY/JUL etc. thus ALWAYS downcase for lookup
|
258
|
-
date[:y] = m[:year].to_i(10) if m[:year]
|
258
|
+
date[:y] = m[:year].to_i(10) if m[:year]
|
259
259
|
date[:m] = MONTH_MAP[ m[:month_name].downcase ] if m[:month_name]
|
260
260
|
date[:d] = m[:day].to_i(10) if m[:day]
|
261
261
|
date[:wday] = DAY_MAP[ m[:day_name].downcase ] if m[:day_name]
|
262
|
-
## note - for debugging keep (pass along) "literal" date
|
263
|
-
[:date, m[:date], date]
|
262
|
+
## note - for debugging keep (pass along) "literal" date
|
263
|
+
[:date, m[:date], date]
|
264
264
|
else
|
265
265
|
[:date, m[:date]]
|
266
266
|
end
|
267
267
|
elsif m[:timezone]
|
268
268
|
[:timezone, m[:timezone]]
|
269
269
|
elsif m[:duration]
|
270
|
-
|
270
|
+
if typed
|
271
|
+
duration = { start: {}, end: {}}
|
272
|
+
duration[:start][:y] = m[:year1].to_i(10) if m[:year1]
|
273
|
+
duration[:start][:m] = MONTH_MAP[ m[:month_name1].downcase ] if m[:month_name1]
|
274
|
+
duration[:start][:d] = m[:day1].to_i(10) if m[:day1]
|
275
|
+
duration[:start][:wday] = DAY_MAP[ m[:day_name1].downcase ] if m[:day_name1]
|
276
|
+
duration[:end][:y] = m[:year2].to_i(10) if m[:year2]
|
277
|
+
duration[:end][:m] = MONTH_MAP[ m[:month_name2].downcase ] if m[:month_name2]
|
278
|
+
duration[:end][:d] = m[:day2].to_i(10) if m[:day2]
|
279
|
+
duration[:end][:wday] = DAY_MAP[ m[:day_name2].downcase ] if m[:day_name2]
|
280
|
+
## note - for debugging keep (pass along) "literal" duration
|
281
|
+
[:duration, m[:duration], duration]
|
282
|
+
else
|
283
|
+
[:duration, m[:duration]]
|
284
|
+
end
|
271
285
|
elsif m[:num]
|
272
286
|
if typed
|
273
287
|
## note - strip enclosing () and convert to integer
|
274
288
|
[:num, m[:value].to_i(10)]
|
275
|
-
else
|
289
|
+
else
|
276
290
|
[:num, m[:num]]
|
277
291
|
end
|
278
292
|
elsif m[:score]
|
279
293
|
if typed
|
280
294
|
score = {}
|
281
295
|
## check for pen
|
282
|
-
score[:p] = [m[:p1].to_i(10),
|
296
|
+
score[:p] = [m[:p1].to_i(10),
|
283
297
|
m[:p2].to_i(10)] if m[:p1] && m[:p2]
|
284
|
-
score[:et] = [m[:et1].to_i(10),
|
298
|
+
score[:et] = [m[:et1].to_i(10),
|
285
299
|
m[:et2].to_i(10)] if m[:et1] && m[:et2]
|
286
|
-
score[:ft] = [m[:ft1].to_i(10),
|
300
|
+
score[:ft] = [m[:ft1].to_i(10),
|
287
301
|
m[:ft2].to_i(10)] if m[:ft1] && m[:ft2]
|
288
|
-
score[:ht] = [m[:ht1].to_i(10),
|
302
|
+
score[:ht] = [m[:ht1].to_i(10),
|
289
303
|
m[:ht2].to_i(10)] if m[:ht1] && m[:ht2]
|
290
304
|
|
291
|
-
## note - for debugging keep (pass along) "literal" score
|
305
|
+
## note - for debugging keep (pass along) "literal" score
|
292
306
|
[:score, m[:score], score]
|
293
307
|
else
|
294
308
|
[:score, m[:score]]
|
@@ -298,7 +312,7 @@ def tokenize_with_errors( line, typed: false,
|
|
298
312
|
minute = {}
|
299
313
|
minute[:m] = m[:value].to_i(10)
|
300
314
|
minute[:offset] = m[:value2].to_i(10) if m[:value2]
|
301
|
-
## note - for debugging keep (pass along) "literal" minute
|
315
|
+
## note - for debugging keep (pass along) "literal" minute
|
302
316
|
[:minute, m[:minute], minute]
|
303
317
|
else
|
304
318
|
[:minute, m[:minute]]
|
@@ -318,16 +332,16 @@ def tokenize_with_errors( line, typed: false,
|
|
318
332
|
when ',' then [:',']
|
319
333
|
when ';' then [:';']
|
320
334
|
when '@' then [:'@']
|
321
|
-
when '|' then [:'|']
|
335
|
+
when '|' then [:'|']
|
322
336
|
else
|
323
337
|
nil ## ignore others (e.g. brackets [])
|
324
338
|
end
|
325
339
|
else
|
326
|
-
## report error
|
340
|
+
## report error
|
327
341
|
nil
|
328
342
|
end
|
329
343
|
|
330
|
-
tokens << t if t
|
344
|
+
tokens << t if t
|
331
345
|
|
332
346
|
if debug
|
333
347
|
print ">"
|
@@ -346,7 +360,7 @@ def tokenize_with_errors( line, typed: false,
|
|
346
360
|
end
|
347
361
|
|
348
362
|
|
349
|
-
[tokens,errors]
|
363
|
+
[tokens,errors]
|
350
364
|
end
|
351
365
|
|
352
366
|
|
@@ -360,5 +374,4 @@ end
|
|
360
374
|
|
361
375
|
|
362
376
|
end # class Parser
|
363
|
-
end # module SportDb
|
364
|
-
|
377
|
+
end # module SportDb
|
data/lib/sportdb/parser.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
## pulls in
|
2
|
-
require 'cocos'
|
1
|
+
## pulls in
|
2
|
+
require 'cocos'
|
3
3
|
require 'season/formats' # e.g. Season() support machinery
|
4
4
|
|
5
5
|
|
@@ -36,7 +36,7 @@ require_relative 'parser/opts'
|
|
36
36
|
=begin
|
37
37
|
module SportDb
|
38
38
|
def self.parser() @@parser ||= Parser.new; end
|
39
|
-
def self.parse( ... )
|
39
|
+
def self.parse( ... )
|
40
40
|
end
|
41
41
|
def self.tokenize( ... )
|
42
42
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sportdb-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-08-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: cocos
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 0.4.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 0.4.0
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: season-formats
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -112,7 +112,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
112
112
|
requirements:
|
113
113
|
- - ">="
|
114
114
|
- !ruby/object:Gem::Version
|
115
|
-
version:
|
115
|
+
version: 3.1.0
|
116
116
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
117
117
|
requirements:
|
118
118
|
- - ">="
|