sportdb-parser 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,196 @@
1
module SportDb
  class Parser

    ## transforms
    ##
    ##  Netherlands  1-2 (1-1)  England
    ##   =>  text  => team
    ##       score|vs
    ##       text  => team


    ##
    ## Forward-only cursor over an array of tokens.
    ##
    ## Every token is an array whose first element is the token type
    ## (a symbol, e.g. :text, :score) and whose second element, if any,
    ## is the token value - e.g. [:text, "England"].
    ##
    ## todo - find a better name e.g. TokenBuffer/Scanner or such ??
    class Tokens
      ## current (zero-based) read position
      attr_reader :pos

      ## tokens - array of [type, value] tokens (kept by reference, not copied)
      def initialize( tokens )
        @tokens = tokens
        @pos    = 0
      end

      ## true once the read position is at (or past) the end of the tokens
      def eos?() @pos >= @tokens.size; end


      ## Is there a token of any of the given types
      ## at or after the current position?
      def include?( *types )
        (@pos...@tokens.size).any? { |i| types.include?( @tokens[i][0] ) }
      end

      ## Do the upcoming token types match the pattern (starting at
      ## the current position)?  Each pattern entry is a single type
      ## or an array of alternative types
      ##   e.g.  match?( :text, [:vs,:score], :text )
      def match?( *pattern )
        pattern.each_with_index.all? do |types, offset|
          ## if single symbol wrap in array
          types = types.is_a?( Array ) ? types : [types]
          types.include?( peek( offset ) )
        end
      end


      ## return token type (e.g. :text, :num, etc.) at the current position
      def cur() peek(0); end

      ## return token content at offset; the token MUST be of type :text
      ##   raises ArgumentError otherwise (incl. if offset is out of range)
      def text( offset=0 )
        if peek( offset ) != :text
          raise ArgumentError, "text(#{offset}) - token not a text type"
        end
        @tokens[@pos+offset][1]
      end


      ## return token type at offset (relative to the current position)
      ##   or nil if out of range
      ##  note - a negative effective index returns nil too
      ##          (and does NOT wrap around to the end of the array)
      def peek( offset=1 )
        idx = @pos + offset
        return nil if idx < 0 || idx >= @tokens.size

        @tokens[idx][0]
      end

      ## consume and return the current token
      ##  note - returns the complete token (not only the type);
      ##         returns nil at end of stream (no error raised)
      def next
        t = @tokens[@pos]
        @pos += 1
        t
      end

      ## consume all remaining tokens and return them as an array;
      ##   if a block is given every token gets passed through the block
      ##   and the block result is collected instead
      def collect
        tokens = []
        until eos?
          t = self.next
          tokens << (block_given? ? yield( t ) : t)
        end
        tokens
      end
    end  # class Tokens



    ##
    ##  todo - add  collect_until  e.g. collect_until( :text )  !!!!


    ##
    ## Tokenize the line and transform the tokens into (parse tree/ast) nodes.
    ##
    ##   line  - the text line to parse
    ##   debug - accepted for interface compatibility; not used here
    ##
    ## returns a two-element array  [nodes, errors]
    ##   where errors are the errors reported by tokenize_with_errors
    def parse_with_errors( line, debug: false )
      errors = []
      tokens, token_errors = tokenize_with_errors( line, typed: true )
      errors += token_errors

      #############
      ## pass 1
      ##   replace all texts with keyword matches (e.g. group, round, leg, etc.)
      tokens = tokens.map do |t|
        if t[0] == :text
          text = t[1]
          if    is_group?( text ) then [:group, text]
          elsif is_leg?( text )   then [:leg,   text]
          elsif is_round?( text ) then [:round, text]
          else  t    ## pass through as-is (1:1)
          end
        else
          t
        end
      end

      ## pass 2
      ##   transform tokens into (parse tree/ast) nodes
      nodes = []
      buf   = Tokens.new( tokens )

      ## check for group def or round def
      ##   note - only possible at the very start of the line;
      ##          the rest of the line gets consumed in one go
      if buf.match?( :round, :'|' )     ## assume round def (change round to round_def)
        nodes << [:round_def, buf.next[1]]
        buf.next    ## swallow pipe
        nodes += buf.collect
      elsif buf.match?( :group, :'|' )  ## assume group def (change group to group_def)
        nodes << [:group_def, buf.next[1]]
        buf.next    ## swallow pipe
        ## change all text to team
        nodes += buf.collect { |t| t[0] == :text ? [:team, t[1]] : t }
      end

      ## note - check for end of stream BEFORE every step;
      ##         an empty line results in no nodes (and not in a nil node)
      until buf.eos?
        if buf.match?( :text, [:score, :vs], :text )
          nodes << [:team, buf.next[1]]
          nodes << buf.next
          nodes << [:team, buf.next[1]]
        elsif buf.match?( :text, :minute )
          nodes << [:player, buf.next[1]]
          nodes << buf.next
        elsif buf.cur == :'@'
          ## add all to the end as is
          ##   only change text to geo
          nodes += buf.collect { |t| t[0] == :text ? [:geo, t[1]] : t }
        else
          ## pass through
          nodes << buf.next
        end
      end

      [nodes, errors]
    end


    ### convenience helper - ignore errors by default
    ##    returns the nodes only
    def parse( line, debug: false )
      nodes, _ = parse_with_errors( line, debug: debug )
      nodes
    end

  end  # class Parser
end    # module SportDb
196
+
@@ -0,0 +1,193 @@
1
module SportDb
  class Parser

    ##
    ## Parse a text table of names into an array of lines (with words).
    ##
    ##  - blank lines and lines starting with # get skipped
    ##  - inline (until end-of-line) comments get stripped too
    ##      e.g.  Janvier Janv Jan  ## check janv in use??
    ##             =>  Janvier Janv Jan
    ##  - the words of a line get split on spaces or tabs
    ##
    ## returns an array of arrays
    ##   e.g. [["January", "Jan"], ["February", "Feb"], ...]
    ##
    ## todo/fix -- add check for duplicates
    def self.parse_names( txt )
      txt.each_line
         .map    { |line| line.strip }
         .reject { |line| line.empty? || line.start_with?( '#' ) }
         .map    { |line| line.sub( /#.*/, '' ).strip.split( /[ \t]+/ ) }
    end


    ## join all words together into a single string
    ##  (ready for use as regex alternation) e.g.
    ##   January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|...
    def self.build_names( lines )
      lines.map { |words| words.join( '|' ) }.join( '|' )
    end


    ## add normalize option (for downcase) - why? why not?
    ##
    ## note: downcase name!!!
    ##  build a lookup map that maps the word to the index (line no) plus 1 e.g.
    ##     {"january" => 1,  "jan" => 1,
    ##      "february" => 2, "feb" => 2,
    ##      "march" => 3,    "mar" => 3,
    ##      "april" => 4,    "apr" => 4,
    ##      "may" => 5,
    ##      "june" => 6,     "jun" => 6, ...
    ##
    ##  note - if a word is listed more than once the last line wins
    def self.build_map( lines )
      lines.each_with_index.each_with_object( {} ) do |(words, idx), map|
        no = idx + 1   ## note: start mapping with 1 (and NOT zero-based, that is, 0)
        words.each { |word| map[ word.downcase ] = no }
      end
    end


    ## month names - one month per line; full name first, short forms after
    MONTH_LINES = parse_names( <<~TXT )
      January    Jan
      February   Feb
      March      Mar
      April      Apr
      May
      June       Jun
      July       Jul
      August     Aug
      September  Sept  Sep
      October    Oct
      November   Nov
      December   Dec
    TXT

    MONTH_NAMES = build_names( MONTH_LINES )
    # pp MONTH_NAMES
    MONTH_MAP   = build_map( MONTH_LINES )
    # pp MONTH_MAP


    ## day names - one (week)day per line; full name first, short forms after
    DAY_LINES = parse_names( <<~TXT )
      Monday     Mon  Mo
      Tuesday    Tues Tue Tu
      Wednesday  Wed  We
      Thursday   Thurs Thur Thu Th
      Friday     Fri  Fr
      Saturday   Sat  Sa
      Sunday     Sun  Su
    TXT

    DAY_NAMES = build_names( DAY_LINES )
    # pp DAY_NAMES
    DAY_MAP   = build_map( DAY_LINES )
    # pp DAY_MAP


    #=>
    # "January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|
    #  July|Jul|August|Aug|September|Sept|Sep|October|Oct|
    #  November|Nov|December|Dec"
    #
    # "Monday|Mon|Mo|Tuesday|Tues|Tue|Tu|Wednesday|Wed|We|
    #  Thursday|Thurs|Thur|Thu|Th|Friday|Fri|Fr|
    #  Saturday|Sat|Sa|Sunday|Sun|Su"



    ## todo - add more date variants !!!!

    ## single date
    ##   e.g. Fri Aug/9  or Fri Aug 9
    ##
    ##  named captures:  date (complete match),
    ##                   day_name (optional), month_name, day, year (optional)
    ##  note - matching is case-insensitive
    DATE_RE = %r{
    (?<date>
      \b
         ## optional day name
         ((?<day_name>#{DAY_NAMES})
              [ ]
         )?
         (?<month_name>#{MONTH_NAMES})
            (?: \/|[ ] )
         (?<day>\d{1,2})
         ## optional year
         (  [ ]
            (?<year>\d{4})
         )?
      \b
    )}ix


    ###
    #  date duration
    #   use - or + as separator
    #    in theory plus (+) only if dates
    #     are two days next to each other
    #
    #   otherwise define new dates type in the future? why? why not?
    #
    #  check for plus (+) if dates are next to each other (t+1) - why? why not?

    #
    #  Sun Jun/23 - Wed Jun/26    -- YES
    #  Jun/23 - Jun/26            -- YES
    #  Tue Jun/25 + Wed Jun/26    -- YES
    #  Jun/25 + Jun/26            -- YES
    #
    #  Jun/25 - 26            - why? why not???
    #  Jun/25 .. 26           - why? why not???
    #  Jun/25 to 26           - why? why not???
    #  Jun/25 + 26            - add - why? why not???
    #  Sun-Wed Jun/23-26      - add - why? why not???
    #  Wed+Thu Jun/26+27 2024 - add - why? why not???
    #
    #  maybe use comma and plus for list of dates
    #    Tue Jun/25, Wed Jun/26, Thu Jun/27  ??
    #    Tue Jun/25 + Wed Jun/26 + Thu Jun/27  ??
    #
    #  add back optional comma (before) year - why? why not?

    ##  named captures:  duration (complete match),
    ##                   day_name1 (optional), month_name1, day1, year1 (optional),
    ##                   day_name2 (optional), month_name2, day2, year2 (optional)
    ##  note - matching is case-insensitive
    DURATION_RE = %r{
    (?<duration>
      \b
         ## optional day name
         ((?<day_name1>#{DAY_NAMES})
              [ ]
         )?
         (?<month_name1>#{MONTH_NAMES})
            (?: \/|[ ] )
         (?<day1>\d{1,2})
         ## optional year
         (  [ ]
            (?<year1>\d{4})
         )?

         ## support + and -  (add .. or such - why??)
         [ ]*[+-][ ]*

         ## optional day name
         ((?<day_name2>#{DAY_NAMES})
              [ ]
         )?
         (?<month_name2>#{MONTH_NAMES})
            (?: \/|[ ] )
         (?<day2>\d{1,2})
         ## optional year
         (  [ ]
            (?<year2>\d{4})
         )?
      \b
    )}ix

  end  # class Parser
end    # module SportDb
193
+
@@ -0,0 +1,121 @@
1
module SportDb
  class Parser

    ## todo/check: use ‹› (unicode chars) to mark optional parts in regex constant name - why? why not?

    #####
    #  english helpers (penalty, extra time, ...)
    ##   note - p must go last (shortest match)
    #     pso = penalty shootout
    ##
    ##   note - both helpers are regex source snippets (strings) written for
    ##          extended (x) mode, that is, the spaces inside get ignored
    P_EN  =  '(?: pso | pen\.? | p\.? )'     # e.g. p., p, pen, pen., PSO, etc.
    ET_EN =  '(?: aet | a\.e\.t\.? )'        # note: make last . optional (e.g a.e.t) allowed too


    ## note: allow SPECIAL cases WITHOUT full time scores (just a.e.t or pen. + a.e.t.)
    ##      3-4 pen. 2-2 a.e.t.
    ##               2-2 a.e.t.
    ##
    ##  named captures:  score (complete match), p1, p2 (optional), et1, et2
    SCORE__P_ET__RE = %r{
        (?<score>
           \b
            (?:
              (?<p1>\d{1,2}) - (?<p2>\d{1,2})
                 [ ]* #{P_EN} [ ]+
             )?            # note: make penalty (P) score optional for now
            (?<et1>\d{1,2}) - (?<et2>\d{1,2})
               [ ]* #{ET_EN}
               (?=[ \]]|$)
        )}ix
    ## todo/check:  remove lookahead assertion here - why require space?
    ## note: \b works only after non-alphanum e.g. )


    ## e.g. 3-4 pen. 2-2 a.e.t. (1-1, 1-1)  or
    ##      3-4p 2-2aet (1-1, )     or
    ##      3-4 pen. 2-2 a.e.t. (1-1)       or
    ##               2-2 a.e.t. (1-1, 1-1)  or
    ##               2-2 a.e.t. (1-1, )     or
    ##               2-2 a.e.t. (1-1)
    ##
    ##  named captures:  score (complete match), p1, p2 (optional), et1, et2,
    ##                   ft1, ft2, ht1, ht2 (optional)
    SCORE__P_ET_FT_HT__RE = %r{
        (?<score>
           \b
            (?:
              (?<p1>\d{1,2}) - (?<p2>\d{1,2})
                 [ ]* #{P_EN} [ ]+
             )?            # note: make penalty (P) score optional for now
            (?<et1>\d{1,2}) - (?<et2>\d{1,2})
               [ ]* #{ET_EN} [ ]+
            \(
            [ ]*
            (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
            [ ]*
             (?:
                  , [ ]*
                 (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
                     [ ]*
                 )?
             )?              # note: make half time (HT) score optional for now
            \)
           (?=[ \]]|$)
        )}ix    ## todo/check:  remove lookahead assertion here - why require space?
                ## note: \b works only after non-alphanum e.g. )

    ###
    ##   special case for case WITHOUT extra time!!
    ##     same as above (but WITHOUT extra time and pen required)
    ##       e.g.  5-1 pen. (1-1)
    ##
    ##  named captures:  score (complete match), p1, p2, ft1, ft2, ht1, ht2 (optional)
    SCORE__P_FT_HT__RE = %r{
        (?<score>
           \b
            (?<p1>\d{1,2}) - (?<p2>\d{1,2})
               [ ]* #{P_EN} [ ]+
            \(
            [ ]*
            (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
            [ ]*
             (?:
                  , [ ]*
                 (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
                     [ ]*
                 )?
             )?              # note: make half time (HT) score optional for now
            \)
           (?=[ \]]|$)
        )}ix    ## todo/check:  remove lookahead assertion here - why require space?
                ## note: \b works only after non-alphanum e.g. )



    ## e.g. 2-1 (1-1) or
    ##      2-1
    ##
    ##  named captures:  score (complete match), ft1, ft2, ht1, ht2 (optional)
    SCORE__FT_HT__RE = %r{
        (?<score>
           \b
            (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
             (?:
                [ ]+ \( [ ]*
                (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
                [ ]* \)
             )?              # note: make half time (HT) score optional for now
           (?=[ \]]|$)
        )}ix    ## todo/check:  remove lookahead assertion here - why require space?
                ## note: \b works only after non-alphanum e.g. )



    #############################################
    # map tables
    #  note: order matters; first come-first matched/served
    ##       (more specific score formats go before the less specific)

    SCORE_RE = Regexp.union(
      SCORE__P_ET_FT_HT__RE,    # e.g. 5-1 pen. 2-2 a.e.t. (1-1, 1-0)
      SCORE__P_FT_HT__RE,       # e.g. 5-1 pen. (1-1)
      SCORE__P_ET__RE,          # e.g. 2-2 a.e.t.  or  5-1 pen. 2-2 a.e.t.
      SCORE__FT_HT__RE          # e.g. 1-1 (1-0)
    )

  end  # class Parser
end    # module SportDb
121
+
@@ -0,0 +1,114 @@
1
module SportDb
  class Parser

    ## note - do NOT allow single alpha text for now
    ##   add later??    A - B    C - D  - why?
    ## opt 1) one alpha
    ##   (?<text_i> [a-z])   # only allow single letter text (not numbers!!)

    ## opt 2) more than one alphanum


    ### allow special case - starting text with number e.g.
    ##    number must be followed by space or dot ()
    #  1 FC    ##    allow 1-FC or 1FC   - why? why not?
    #  1. FC
    #  1.FC   - XXXX  - not allowed for now, parse error
    #  1FC    - XXXX  - not allowed for now, parse error
    #  1890 Munich
    #


    ##
    #  allow Cote'd Ivoir or such
    ##   e.g. add '


    ##
    ## Free-standing text (e.g. team, player or place name).
    ##
    ##  named capture:  text (complete match)
    ##
    ##  the text must be delimited on both sides by space, comma, semicolon,
    ##   @, | or square brackets - or by the start/end of the line;
    ##  inline only single spaces or dashes are allowed as separators,
    ##   and a space followed by v / vs / v. / vs. (plus space) ends the text
    ##  note - matching is case-insensitive
    TEXT_RE = %r{
        ## must start with alpha (allow unicode letters!!)
        (?<text>
                  ## positive lookbehind
                  ##  (MUST be fixed number of chars - no quantifier e.g. +? etc.)
                    (?<=[ ,;@|\[\]]
                         |^
                    )
                   (?:
                       # opt 1 - start with alpha
                         \p{L}+    ## all unicode letters (e.g. [a-z])
                           |

                       # opt 2 - start with num!! - allow special case (e.g. 1. FC)
                         \d+  # check for num lookahead (MUST be space or dot)
                          ## MUST be followed by (optional dot) and
                          ##                      required space !!!
                          ## MUST be follow by a to z!!!!
                          \.?     ## optional dot
                          [ ]?   ## make space optional too  - why? why not?
                                 ##  yes - eg. 1st, 2nd, 5th etc.
                          \p{L}+
                    )

                    (?:(?:  (?:[ ]
                                  (?!vs?\.?[ ])    ## note - exclude (v[ ]/vs[ ]/v.[ ]/vs.[ ])
                              )
                             |     # only single spaces allowed inline!!!
                           [-]
                        )?
                      (?:
                        \p{L} |
                        [&/']
                          |
                        (?:
                          \d+
                          (?![0-9.:h'/+-])
                          ## negative lookahead for numbers
                          ##   note - include digits itself!!!
                        )|
                        \.
                      )
                    )*  ## must NOT end with space or dash(-)
                    ##  todo/fix - possible in regex here
                    ##   only end in alphanum a-z0-9 (not dot or & ???)


                    ## allow optional at the end
                    ##   tag or year
                    ##   make it and in the future - why? why not?
                    ##
                    ##  (A) - allow with predined alpha only for now
                    ##        e.g. (A) - amateur a team or b?
                    ###  or U21 U9 etc. - why? why not?
                    ##   or etc.
                    ##  (1879-1893) or allow years e.g. (1879-1893)
                    ###
                    (?:
                       [ ]
                       \(  (?:
                              A|B|
                              U\d{1,2}
                           )
                       \)
                    )?
                    (?:
                       [ ]
                       \(
                           \d{4}-\d{4}
                       \)
                    )?

                  ## add lookahead/lookbehind
                  ##   must be space!!!
                  ##   (or comma or start/end of string)
                  ##   kind of \b !!!
                   ## positive lookahead
                    (?=[ ,;@|\[\]]
                         |$
                    )
        )
    }ix

  end  # class Parser
end    # module SportDb
114
+