sportdb-parser 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,196 @@
1
+ module SportDb
2
+ class Parser
3
+
4
+
5
+ ## transforms
6
+ ##
7
+ ## Netherlands 1-2 (1-1) England
8
+ ## => text => team
9
+ ## score|vs
10
+ ## text => team
11
+
12
+
13
+ ## token iter/find better name
14
+ ## e.g. TokenBuffer/Scanner or such ??
15
class Tokens
  ## read-only cursor over an array of tokens;
  ##   every token is expected to be an array
  ##   with the type first and the content second  e.g. [:text, 'England']

  ## index of the current token (zero-based)
  attr_reader :pos

  def initialize( tokens )
    @tokens = tokens
    @pos    = 0
  end

  ## true if the cursor moved past the last token
  def eos?
    @pos >= @tokens.size
  end


  ## check if any token from the current position onwards
  ##   has one of the passed-in types
  def include?( *types )
    (@pos...@tokens.size).any? do |idx|
      types.include?( @tokens[idx][0] )
    end
  end

  ## check if the upcoming token types match the pattern
  ##   pattern e.g. [:text, [:vs,:score], :text]
  ##   note - an array entry means any of the listed types
  def match?( *pattern )
    pattern.each_with_index.all? do |entry, offset|
      candidates = entry.is_a?( Array ) ? entry : [entry]
      candidates.include?( peek( offset ) )
    end
  end


  ## return token type (e.g. :text, :num, etc.) of current token
  def cur
    peek( 0 )
  end

  ## return content of token at offset
  ##   raises ArgumentError if token is NOT of type :text
  def text( offset=0 )
    unless peek( offset ) == :text
      raise ArgumentError, "text(#{offset}) - token not a text type"
    end
    @tokens[@pos+offset][1]
  end


  ## return token type at offset (note - default is the NEXT token)
  ##   or nil if past the end
  def peek( offset=1 )
    idx = @pos + offset
    idx < @tokens.size ? @tokens[idx][0] : nil
  end

  ## note - returns complete token (and advances the cursor)
  ##   returns nil if past the end (no error raised)
  def next
    token = @tokens[@pos]
    @pos += 1
    token
  end

  ## consume all remaining tokens;
  ##   returns an array of the tokens (or of the block results if block given)
  def collect( &blk )
    collected = []
    until eos?
      token = self.next
      collected << (blk ? blk.call( token ) : token)
    end
    collected
  end
end  # class Tokens
94
+
95
+
96
+
97
+ ##
98
+ ##
99
+ ## add !!!!
100
+ ## collect_until e.g. collect_until( :text )
101
+
102
+
103
## parse a single line into a flat list of (parse tree/ast) nodes
##
##   e.g.  text score|vs text   =>  team score|vs team
##         text minute          =>  player minute
##         @ text ...           =>  @ geo ...
##
## params:
##   line  - the text; gets tokenized via tokenize_with_errors
##   debug - accepted for interface compatibility; not used in here
##
## returns a two-element array [nodes, errors]
##   where errors holds whatever tokenize_with_errors reported
##
## note - an empty token list (e.g. blank line) results in NO nodes
##        (and NOT in a single nil node)
def parse_with_errors( line, debug: false )
  errors = []
  tokens, token_errors = tokenize_with_errors( line, typed: true )
  errors += token_errors

  #############
  ## pass 1
  ##   replace all texts with keyword matches (e.g. group, round, leg, etc.)
  tokens = tokens.map do |t|
    next t   unless t[0] == :text    ## pass through as-is (1:1)

    text = t[1]
    if is_group?( text )
      [:group, text]
    elsif is_leg?( text )
      [:leg, text]
    elsif is_round?( text )
      [:round, text]
    else
      t     ## pass through as-is (1:1)
    end
  end

  ## pass 2
  ##   transform tokens into (parse tree/ast) nodes
  nodes = []
  buf   = Tokens.new( tokens )

  ## note - check for end-of-stream BEFORE every step;
  ##   buf.next returns nil past the end and would add a nil node
  until buf.eos?
    if buf.pos == 0
      ## check for group def or round def (only at start of line)
      if buf.match?( :round, :'|' )     ## assume round def
        nodes << [:round_def, buf.next[1]]
        buf.next   ## swallow pipe
        nodes += buf.collect
        break
      end
      if buf.match?( :group, :'|' )     ## assume group def
        nodes << [:group_def, buf.next[1]]
        buf.next   ## swallow pipe
        ## change all text to team
        nodes += buf.collect { |t|
                    t[0] == :text ? [:team, t[1]] : t
                  }
        break
      end
    end

    if buf.match?( :text, [:score, :vs], :text )
      nodes << [:team, buf.next[1]]
      nodes << buf.next
      nodes << [:team, buf.next[1]]
    elsif buf.match?( :text, :minute )
      nodes << [:player, buf.next[1]]
      nodes << buf.next
    elsif buf.cur == :'@'
      ## add all to the end as is
      ##   only change text to geo
      nodes += buf.collect { |t|
                  t[0] == :text ? [:geo, t[1]] : t
                }
      break
    else
      ## pass through
      nodes << buf.next
    end
  end

  [nodes, errors]
end
185
+
186
+
187
+ ### convenience helper - ignore errors by default
188
def parse( line, debug: false )
  ## note - parse_with_errors returns [nodes, errors];
  ##   keep the nodes only (errors get dropped)
  parse_with_errors( line, debug: debug ).first
end
192
+
193
+
194
+ end # class Parser
195
+ end # module SportDb
196
+
@@ -0,0 +1,193 @@
1
+ module SportDb
2
+ class Parser
3
+
4
+
5
+
6
def self.parse_names( txt )
  ## returns an array of lines (with words)
  ##   - blank lines and comment lines (starting with #) get skipped
  ##   - inline (until end-of-line) comments get stripped
  ##       e.g. Janvier Janv Jan  ## check janv in use??
  ##              => Janvier Janv Jan
  ##   - words get split on spaces or tabs
  ##
  ## todo/fix -- add check for duplicates
  txt.each_line
     .map( &:strip )
     .reject { |line| line.empty? || line.start_with?( '#' ) }
     .map    { |line| line.sub( /#.*/, '' ).strip.split( /[ \t]+/ ) }
end  # method parse_names
31
+
32
+
33
def self.build_names( lines )
  ## step 1 - join the words of every line e.g. January|Jan
  alternatives = lines.map { |words| words.join( '|' ) }
  ## step 2 - join all lines into a single string e.g.
  ##   January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|...
  alternatives.join( '|' )
end
38
+
39
+
40
+
41
+ ## add normalize option (for downcase) - why? why not?
42
def self.build_map( lines )
  ## build a lookup map from the (downcased!) word to the line no e.g.
  ##   {"january" => 1,  "jan" => 1,
  ##    "february" => 2, "feb" => 2,
  ##    "may" => 5, ...
  ##
  ## note: line numbers start with 1 (and NOT zero-based, that is, 0)
  lines.each.with_index( 1 ).each_with_object( {} ) do |(words, line_no), map|
    words.each { |word| map[ word.downcase ] = line_no }
  end
end
56
+
57
+
58
## month names - one month per line (full name first, followed by short forms)
MONTH_LINES = parse_names( <<~TXT )
  January Jan
  February Feb
  March Mar
  April Apr
  May
  June Jun
  July Jul
  August Aug
  September Sept Sep
  October Oct
  November Nov
  December Dec
TXT

## all names joined with | (for use inside a regex)
MONTH_NAMES = build_names( MONTH_LINES )
# pp MONTH_NAMES

## lookup from downcased name to month number (1-12)
MONTH_MAP   = build_map( MONTH_LINES )
76
+ # pp MONTH_MAP
77
+
78
+
79
+
80
## (week)day names - one day per line (full name first, followed by short forms)
DAY_LINES = parse_names( <<~TXT )
  Monday Mon Mo
  Tuesday Tues Tue Tu
  Wednesday Wed We
  Thursday Thurs Thur Thu Th
  Friday Fri Fr
  Saturday Sat Sa
  Sunday Sun Su
TXT

## all names joined with | (for use inside a regex)
DAY_NAMES = build_names( DAY_LINES )
# pp DAY_NAMES

## lookup from downcased name to day number (1-7, starting with monday)
DAY_MAP   = build_map( DAY_LINES )
93
+ # pp DAY_MAP
94
+
95
+
96
+ #=>
97
+ # "January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|
98
+ # July|Jul|August|Aug|September|Sept|Sep|October|Oct|
99
+ # November|Nov|December|Dec"
100
+ #
101
+ # "Monday|Mon|Mo|Tuesday|Tues|Tue|Tu|Wednesday|Wed|We|
102
+ # Thursday|Thurs|Thur|Thu|Th|Friday|Fri|Fr|
103
+ # Saturday|Sat|Sa|Sunday|Sun|Su"
104
+
105
+
106
+
107
+ ## todo - add more date variants !!!!
108
+
109
+ # e.g. Fri Aug/9 or Fri Aug 9
110
+ DATE_RE = %r{
111
+ (?<date>
112
+ \b
113
+ ## optional day name
114
+ ((?<day_name>#{DAY_NAMES})
115
+ [ ]
116
+ )?
117
+ (?<month_name>#{MONTH_NAMES})
118
+ (?: \/|[ ] )
119
+ (?<day>\d{1,2})
120
+ ## optional year
121
+ ( [ ]
122
+ (?<year>\d{4})
123
+ )?
124
+ \b
125
+ )}ix
126
+
127
+
128
+ ###
129
+ # date duration
130
+ # use - or + as separator
131
+ # in theory plus( +) only if dates
132
+ # are two days next to each other
133
+ #
134
+ # otherwise define new dates type in the future? why? why not?
135
+ #
136
+ # check for plus (+) if dates are next to each other (t+1) - why? why not?
137
+
138
+ #
139
+ # Sun Jun/23 - Wed Jun/26 -- YES
140
+ # Jun/23 - Jun/26 -- YES
141
+ # Tue Jun/25 + Wed Jun/26 -- YES
142
+ # Jun/25 + Jun/26 -- YES
143
+ #
144
+ # Jun/25 - 26 - why? why not???
145
+ # Jun/25 .. 26 - why? why not???
146
+ # Jun/25 to 26 - why? why not???
147
+ # Jun/25 + 26 - add - why? why not???
148
+ # Sun-Wed Jun/23-26 - add - why? why not???
149
+ # Wed+Thu Jun/26+27 2024 - add - why? why not???
150
+ #
151
+ # maybe use comma and plus for list of dates
152
+ # Tue Jun/25, Wed Jun/26, Thu Jun/27 ??
153
+ # Tue Jun/25 + Wed Jun/26 + Thu Jun/27 ??
154
+ #
155
+ # add back optional comma (before) year - why? why not?
156
+
157
+
158
+ DURATION_RE = %r{
159
+ (?<duration>
160
+ \b
161
+ ## optional day name
162
+ ((?<day_name1>#{DAY_NAMES})
163
+ [ ]
164
+ )?
165
+ (?<month_name1>#{MONTH_NAMES})
166
+ (?: \/|[ ] )
167
+ (?<day1>\d{1,2})
168
+ ## optional year
169
+ ( [ ]
170
+ (?<year1>\d{4})
171
+ )?
172
+
173
+ ## support + and - (add .. or such - why??)
174
+ [ ]*[+-][ ]*
175
+
176
+ ## optional day name
177
+ ((?<day_name2>#{DAY_NAMES})
178
+ [ ]
179
+ )?
180
+ (?<month_name2>#{MONTH_NAMES})
181
+ (?: \/|[ ] )
182
+ (?<day2>\d{1,2})
183
+ ## optional year
184
+ ( [ ]
185
+ (?<year2>\d{4})
186
+ )?
187
+ \b
188
+ )}ix
189
+
190
+
191
+ end # class Parser
192
+ end # module SportDb
193
+
@@ -0,0 +1,121 @@
1
+ module SportDb
2
+ class Parser
3
+
4
+
5
+ ## todo/check: use ‹› (unicode chars) to mark optional parts in regex constant name - why? why not?
6
+
7
+ #####
8
+ # english helpers (penalty, extra time, ...)
9
+ ## note - p must go last (shortest match)
10
+ # pso = penalty shootout
11
+ P_EN = '(?: pso | pen\.? | p\.? )' # e.g. p., p, pen, pen., PSO, etc.
12
+ ET_EN = '(?: aet | a\.e\.t\.? )' # note: make last . optional (e.g a.e.t) allowed too
13
+
14
+
15
+ ## note: allow SPECIAL cases WITHOUT full time scores (just a.e.t or pen. + a.e.t.)
16
+ ## 3-4 pen. 2-2 a.e.t.
17
+ ## 3-4 pen. 2-2 a.e.t.
18
+ ## 2-2 a.e.t.
19
+ SCORE__P_ET__RE = %r{
20
+ (?<score>
21
+ \b
22
+ (?:
23
+ (?<p1>\d{1,2}) - (?<p2>\d{1,2})
24
+ [ ]* #{P_EN} [ ]+
25
+ )? # note: make penalty (P) score optional for now
26
+ (?<et1>\d{1,2}) - (?<et2>\d{1,2})
27
+ [ ]* #{ET_EN}
28
+ (?=[ \]]|$)
29
+ )}ix
30
+ ## todo/check: remove lookahead assertion here - why require space?
31
+ ## note: \b works only after non-alphanum e.g. )
32
+
33
+
34
+ ## e.g. 3-4 pen. 2-2 a.e.t. (1-1, 1-1) or
35
+ ## 3-4p 2-2aet (1-1, ) or
36
+ ## 3-4 pen. 2-2 a.e.t. (1-1) or
37
+ ## 2-2 a.e.t. (1-1, 1-1) or
38
+ ## 2-2 a.e.t. (1-1, ) or
39
+ ## 2-2 a.e.t. (1-1)
40
+
41
+ SCORE__P_ET_FT_HT__RE = %r{
42
+ (?<score>
43
+ \b
44
+ (?:
45
+ (?<p1>\d{1,2}) - (?<p2>\d{1,2})
46
+ [ ]* #{P_EN} [ ]+
47
+ )? # note: make penalty (P) score optional for now
48
+ (?<et1>\d{1,2}) - (?<et2>\d{1,2})
49
+ [ ]* #{ET_EN} [ ]+
50
+ \(
51
+ [ ]*
52
+ (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
53
+ [ ]*
54
+ (?:
55
+ , [ ]*
56
+ (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
57
+ [ ]*
58
+ )?
59
+ )? # note: make half time (HT) score optional for now
60
+ \)
61
+ (?=[ \]]|$)
62
+ )}ix ## todo/check: remove loakahead assertion here - why require space?
63
+ ## note: \b works only after non-alphanum e.g. )
64
+
65
+ ###
66
+ ## special case for case WITHOUT extra time!!
67
+ ## same as above (but WITHOUT extra time and pen required)
68
+ SCORE__P_FT_HT__RE = %r{
69
+ (?<score>
70
+ \b
71
+ (?<p1>\d{1,2}) - (?<p2>\d{1,2})
72
+ [ ]* #{P_EN} [ ]+
73
+ \(
74
+ [ ]*
75
+ (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
76
+ [ ]*
77
+ (?:
78
+ , [ ]*
79
+ (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
80
+ [ ]*
81
+ )?
82
+ )? # note: make half time (HT) score optional for now
83
+ \)
84
+ (?=[ \]]|$)
85
+ )}ix ## todo/check: remove loakahead assertion here - why require space?
86
+ ## note: \b works only after non-alphanum e.g. )
87
+
88
+
89
+
90
+ ## e.g. 2-1 (1-1) or
91
+ ## 2-1
92
+
93
+ SCORE__FT_HT__RE = %r{
94
+ (?<score>
95
+ \b
96
+ (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
97
+ (?:
98
+ [ ]+ \( [ ]*
99
+ (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
100
+ [ ]* \)
101
+ )? # note: make half time (HT) score optional for now
102
+ (?=[ \]]|$)
103
+ )}ix ## todo/check: remove loakahead assertion here - why require space?
104
+ ## note: \b works only after non-alphanum e.g. )
105
+
106
+
107
+
108
+ #############################################
109
+ # map tables
110
+ # note: order matters; first come-first matched/served
111
+
112
## all score formats combined into one regex (alternation)
##   note: order matters; the alternatives get tried from first to last
SCORE_RE = Regexp.union( [
  SCORE__P_ET_FT_HT__RE,   # e.g. 5-1 pen. 2-2 a.e.t. (1-1, 1-0)
  SCORE__P_FT_HT__RE,      # e.g. 5-1 pen. (1-1)
  SCORE__P_ET__RE,         # e.g. 2-2 a.e.t.  or  5-1 pen. 2-2 a.e.t.
  SCORE__FT_HT__RE,        # e.g. 1-1 (1-0)
] )
118
+
119
+ end # class Parser
120
+ end # module SportDb
121
+
@@ -0,0 +1,114 @@
1
+ module SportDb
2
+ class Parser
3
+
4
+
5
+ ## note - do NOT allow single alpha text for now
6
+ ## add later?? A - B C - D - why?
7
+ ## opt 1) one alpha
8
+ ## (?<text_i> [a-z]) # only allow single letter text (not numbers!!)
9
+
10
+ ## opt 2) more than one alphanum
11
+
12
+
13
+ ### allow special case - starting text with number e.g.
14
+ ## number must be followed by space or dot ()
15
+ # 1 FC ## allow 1-FC or 1FC - why? why not?
16
+ # 1. FC
17
+ # 1.FC - XXXX - not allowed for now, parse error
18
+ # 1FC - XXXX - not allowed for now, parse error
19
+ # 1890 Munich
20
+ #
21
+
22
+
23
+ ##
24
+ # allow Cote'd Ivoir or such
25
+ ## e.g. add '
26
+
27
+
28
+ TEXT_RE = %r{
29
+ ## must start with alpha (allow unicode letters!!)
30
+ (?<text>
31
+ ## positive lookbehind
32
+ ## (MUST be fixed number of chars - no quantifier e.g. +? etc.)
33
+ (?<=[ ,;@|\[\]]
34
+ |^
35
+ )
36
+ (?:
37
+ # opt 1 - start with alpha
38
+ \p{L}+ ## all unicode letters (e.g. [a-z])
39
+ |
40
+
41
+ # opt 2 - start with num!! - allow special case (e.g. 1. FC)
42
+ \d+ # check for num lookahead (MUST be space or dot)
43
+ ## MUST be followed by (optional dot) and
44
+ ## required space !!!
45
+ ## MUST be follow by a to z!!!!
46
+ \.? ## optional dot
47
+ [ ]? ## make space optional too - why? why not?
48
+ ## yes - eg. 1st, 2nd, 5th etc.
49
+ \p{L}+
50
+ )
51
+
52
+ (?:(?: (?:[ ]
53
+ (?!vs?\.?[ ]) ## note - exclude (v[ ]/vs[ ]/v.[ ]/vs.[ ])
54
+ )
55
+ | # only single spaces allowed inline!!!
56
+ [-]
57
+ )?
58
+ (?:
59
+ \p{L} |
60
+ [&/']
61
+ |
62
+ (?:
63
+ \d+
64
+ (?![0-9.:h'/+-])
65
+ ## negative lookahead for numbers
66
+ ## note - include digits itself!!!
67
+ )|
68
+ \.
69
+ )
70
+ )* ## must NOT end with space or dash(-)
71
+ ## todo/fix - possible in regex here
72
+ ## only end in alphanum a-z0-9 (not dot or & ???)
73
+
74
+
75
+ ## allow optional at the end
76
+ ## tag or year
77
+ ## make it and in the future - why? why not?
78
+ ##
79
+ ## (A) - allow with predined alpha only for now
80
+ ## e.g. (A) - amateur a team or b?
81
+ ## or U21 U9 etc. - why? why not?
82
+ ## or etc.
83
+ ## (1879-1893) or allow years e.g. (1879-1893)
84
+ ###
85
+ (?:
86
+ [ ]
87
+ \( (?:
88
+ A|B|
89
+ U\d{1,2}
90
+ )
91
+ \)
92
+ )?
93
+ (?:
94
+ [ ]
95
+ \(
96
+ \d{4}-\d{4}
97
+ \)
98
+ )?
99
+
100
+ ## add lookahead/lookbehind
101
+ ## must be space!!!
102
+ ## (or comma or start/end of string)
103
+ ## kind of \b !!!
104
+ ## positive lookahead
105
+ (?=[ ,;@|\[\]]
106
+ |$
107
+ )
108
+ )
109
+ }ix
110
+
111
+
112
+ end # class Parser
113
+ end # module SportDb
114
+