sportdb-parser 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Manifest.txt +14 -0
- data/README.md +8 -0
- data/Rakefile +27 -0
- data/bin/fbt +144 -0
- data/lib/sportdb/parser/lang.rb +111 -0
- data/lib/sportdb/parser/linter.rb +153 -0
- data/lib/sportdb/parser/outline_reader.rb +101 -0
- data/lib/sportdb/parser/parser.rb +196 -0
- data/lib/sportdb/parser/token-date.rb +193 -0
- data/lib/sportdb/parser/token-score.rb +121 -0
- data/lib/sportdb/parser/token-text.rb +114 -0
- data/lib/sportdb/parser/token.rb +364 -0
- data/lib/sportdb/parser.rb +44 -0
- metadata +96 -0
@@ -0,0 +1,196 @@
|
|
1
|
+
module SportDb
class Parser

  ## transforms
  ##
  ##  Netherlands  1-2 (1-1)  England
  ##   =>  text       => team
  ##       score|vs
  ##       text       => team


  ##
  ## Cursor over an array of tokens.
  ##
  ##   Every token is read as an array - item 0 is the token type
  ##   (a symbol e.g. :text, :score, :vs) and item 1 (if present) is
  ##   the token value.
  ##
  ## todo/check: find a better name e.g. TokenBuffer/Scanner or such ??
  class Tokens
    ## current (zero-based) read position
    attr_reader :pos

    def initialize( tokens )
      @tokens = tokens
      @pos    = 0
    end

    ## true if all tokens are consumed
    def eos?() @pos >= @tokens.size; end


    ## check if any of the passed-in token types shows up
    ##   anywhere from the current position to the end
    def include?( *types )
      @tokens.drop( @pos ).any? { |t| types.include?( t[0] ) }
    end

    ## check if the upcoming token types match the pattern
    ##   pattern e.g. [:text, [:vs,:score], :text]
    ##   note - an array entry means "any one of these types"
    def match?( *pattern )
      pattern.each_with_index do |types,offset|
        ## if single symbol wrap in array
        types = [types]   unless types.is_a?( Array )
        return false      unless types.include?( peek(offset) )
      end
      true
    end


    ## return token type (e.g. :text, :num, etc.) at the current position
    def cur() peek(0); end

    ## return content (value) of the token at offset
    ##   raises ArgumentError if the token is NOT a text token
    ##   (or if there is no token at offset)
    def text(offset=0)
      if peek( offset ) != :text
        raise ArgumentError, "text(#{offset}) - token not a text type"
      end
      @tokens[@pos+offset][1]
    end


    ## return token type at offset (relative to the current position)
    ##   returns nil if the position is outside of the token array
    ##   note - a negative (absolute) index returns nil too
    ##             and does NOT wrap around to the end of the array
    def peek(offset=1)
      idx = @pos+offset
      return nil    if idx < 0 || idx >= @tokens.size
      @tokens[idx][0]
    end

    ## note - returns complete token (and moves on to the next position)
    ##   returns nil if already at the end
    ##   todo/check: raise (standard) end of iteration here - why? why not?
    def next
      t = @tokens[@pos]
      @pos += 1
      t
    end

    ## consume (and return) all remaining tokens;
    ##   if a block is passed in, return the block results instead
    def collect( &blk )
      tokens = []
      until eos?
        t = self.next
        tokens << (blk ? blk.call( t ) : t)
      end
      tokens
    end
  end  # class Tokens


  ##
  ##  add !!!!
  ##    collect_until  e.g.  collect_until( :text )


  ## parse a single line into (parse tree/ast) nodes
  ##   returns a pair, that is, [nodes, errors]
  ##   - errors are passed through from tokenize_with_errors
  ##   - an empty token list results in empty nodes (and NOT in [nil])
  ##   note - the debug flag is accepted but not used for now
  def parse_with_errors( line, debug: false )
    errors = []
    tokens, token_errors = tokenize_with_errors( line, typed: true )
    errors += token_errors

    #############
    ## pass 1
    ##   replace all texts with keyword matches (e.g. group, round, leg, etc.)
    tokens = tokens.map do |t|
      next t    unless t[0] == :text

      text = t[1]
      if    is_group?( text )   then [:group, text]
      elsif is_leg?( text )     then [:leg,   text]
      elsif is_round?( text )   then [:round, text]
      else                           t    ## pass through as-is (1:1)
      end
    end

    ## transform tokens into (parse tree/ast) nodes
    nodes = []
    buf   = Tokens.new( tokens )

    ## check (only at the start of the line) for
    ##   group def or round def;
    ##   a definition always consumes the complete line
    if buf.match?( :round, :'|' )     ## assume round def (change round to round_def)
      nodes << [:round_def, buf.next[1]]
      buf.next   ## swallow pipe
      nodes += buf.collect
    elsif buf.match?( :group, :'|' )  ## assume group def (change group to group_def)
      nodes << [:group_def, buf.next[1]]
      buf.next   ## swallow pipe
      ## change all text to team
      nodes += buf.collect { |t| t[0] == :text ? [:team, t[1]] : t }
    end

    ## note - check for end BEFORE reading a token;
    ##         otherwise an empty line adds a nil node
    until buf.eos?
      if buf.match?( :text, [:score, :vs], :text )
        nodes << [:team, buf.next[1]]
        nodes << buf.next
        nodes << [:team, buf.next[1]]
      elsif buf.match?( :text, :minute )
        nodes << [:player, buf.next[1]]
        nodes << buf.next
      elsif buf.cur == :'@'
        ## add all to the end as is
        ##   only change text to geo
        nodes += buf.collect { |t| t[0] == :text ? [:geo, t[1]] : t }
      else
        ## pass through
        nodes << buf.next
      end
    end

    [nodes,errors]
  end


  ### convenience helper - ignore errors by default
  def parse( line, debug: false )
    nodes, _ = parse_with_errors( line, debug: debug )
    nodes
  end

end  # class Parser
end  # module SportDb
|
196
|
+
|
@@ -0,0 +1,193 @@
|
|
1
|
+
module SportDb
class Parser

  ## parse a text block with one entry per line and
  ##   one or more (space or tab separated) names per entry
  ##
  ##  - blank lines and comment lines (starting with #) get skipped
  ##  - inline (until end-of-line) comments get stripped too
  ##      e.g. Janvier Janv Jan  ## check janv in use??
  ##            =>  Janvier Janv Jan
  ##
  ##  returns an array of lines (with words) e.g.
  ##     [["January", "Jan"], ["February", "Feb"], ...]
  ##
  ##  todo/fix -- add check for duplicates
  def self.parse_names( txt )
    txt.each_line
       .map    { |line| line.strip }
       .reject { |line| line.empty? || line.start_with?( '#' ) }
       .map    { |line| line.sub( /#.*/, '' ).strip.split( /[ \t]+/ ) }
  end


  ## join all words together into a single string
  ##  (ready to use as a regex alternation) e.g.
  ##   January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|...
  ##
  ##  note - flatten first; a line without any words must NOT
  ##           add an empty branch (e.g. a||b) that matches
  ##           the empty string
  def self.build_names( lines )
    lines.flatten.join( '|' )
  end


  ## build a lookup map that maps the word to the index (line no) plus 1 e.g.
  ##     {"january" => 1,  "jan" => 1,
  ##      "february" => 2, "feb" => 2,
  ##      "march" => 3,    "mar" => 3,
  ##      "april" => 4,    "apr" => 4,
  ##      "may" => 5,
  ##      "june" => 6,     "jun" => 6, ...
  ##
  ##  note: downcase name!!!
  ##  note: a name listed more than once keeps the last index
  ##
  ##  todo/check: add normalize option (for downcase) - why? why not?
  def self.build_map( lines )
    lines.each_with_index.each_with_object( {} ) do |(line,i),h|
      ## note: start mapping with 1 (and NOT zero-based, that is, 0)
      line.each { |name| h[ name.downcase ] = i+1 }
    end
  end


  MONTH_LINES = parse_names( <<TXT )
January    Jan
February   Feb
March      Mar
April      Apr
May
June       Jun
July       Jul
August     Aug
September  Sept  Sep
October    Oct
November   Nov
December   Dec
TXT

  MONTH_NAMES = build_names( MONTH_LINES )
  # pp MONTH_NAMES
  MONTH_MAP   = build_map( MONTH_LINES )
  # pp MONTH_MAP


  DAY_LINES = parse_names( <<TXT )
Monday      Mon  Mo
Tuesday     Tues Tue Tu
Wednesday   Wed  We
Thursday    Thurs Thur Thu Th
Friday      Fri  Fr
Saturday    Sat  Sa
Sunday      Sun  Su
TXT

  DAY_NAMES = build_names( DAY_LINES )
  # pp DAY_NAMES
  DAY_MAP   = build_map( DAY_LINES )
  # pp DAY_MAP


  #=>
  # "January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|
  #  July|Jul|August|Aug|September|Sept|Sep|October|Oct|
  #  November|Nov|December|Dec"
  #
  # "Monday|Mon|Mo|Tuesday|Tues|Tue|Tu|Wednesday|Wed|We|
  #  Thursday|Thurs|Thur|Thu|Th|Friday|Fri|Fr|
  #  Saturday|Sat|Sa|Sunday|Sun|Su"


  ## todo - add more date variants !!!!

  # e.g. Fri Aug/9  or Fri Aug 9
  #
  #  note: the optional parts use explicit non-capturing groups;
  #          only the named groups
  #          (date, day_name, month_name, day, year) get captured
  DATE_RE = %r{
      (?<date>
        \b
           ## optional day name
           (?: (?<day_name>#{DAY_NAMES})
                  [ ]
           )?
           (?<month_name>#{MONTH_NAMES})
              (?: \/|[ ] )
           (?<day>\d{1,2})
           ## optional year
           (?: [ ]
               (?<year>\d{4})
           )?
        \b
      )}ix


  ###
  #  date duration
  #   use - or + as separator
  #    in theory plus (+) only if dates
  #      are two days next to each other
  #
  #   otherwise define new dates type in the future? why? why not?
  #
  #  check for plus (+) if dates are next to each other (t+1) - why? why not?
  #
  #    Sun Jun/23 - Wed Jun/26   -- YES
  #    Jun/23 - Jun/26           -- YES
  #    Tue Jun/25 + Wed Jun/26   -- YES
  #    Jun/25 + Jun/26           -- YES
  #
  #    Jun/25 - 26               - why? why not???
  #    Jun/25 .. 26              - why? why not???
  #    Jun/25 to 26              - why? why not???
  #    Jun/25 + 26               - add - why? why not???
  #    Sun-Wed Jun/23-26         - add - why? why not???
  #    Wed+Thu Jun/26+27 2024    - add - why? why not???
  #
  #   maybe use comma and plus for list of dates
  #     Tue Jun/25, Wed Jun/26, Thu Jun/27  ??
  #     Tue Jun/25 + Wed Jun/26 + Thu Jun/27  ??
  #
  #  add back optional comma (before) year - why? why not?

  DURATION_RE = %r{
      (?<duration>
        \b
           ## optional day name
           (?: (?<day_name1>#{DAY_NAMES})
                  [ ]
           )?
           (?<month_name1>#{MONTH_NAMES})
              (?: \/|[ ] )
           (?<day1>\d{1,2})
           ## optional year
           (?: [ ]
               (?<year1>\d{4})
           )?

           ## support + and -  (add .. or such - why??)
           [ ]*[+-][ ]*

           ## optional day name
           (?: (?<day_name2>#{DAY_NAMES})
                  [ ]
           )?
           (?<month_name2>#{MONTH_NAMES})
              (?: \/|[ ] )
           (?<day2>\d{1,2})
           ## optional year
           (?: [ ]
               (?<year2>\d{4})
           )?
        \b
      )}ix

end  # class Parser
end  # module SportDb
|
193
|
+
|
@@ -0,0 +1,121 @@
|
|
1
|
+
module SportDb
class Parser

  ## todo/check: use ‹› (unicode chars) to mark optional parts in regex constant name - why? why not?

  #####
  #  english helpers (penalty, extra time, ...)
  ##   note - p must go last (shortest match)
  #     pso = penalty shootout
  P_EN  = '(?: pso | pen\.? | p\.? )'     # e.g. p., p, pen, pen., PSO, etc.
  ET_EN = '(?: aet | a\.e\.t\.? )'        # note: make last . optional (e.g a.e.t) allowed too


  #####
  #  shared building blocks for the score regexes below
  #    note - all fragments get interpolated into regexes using
  #            the extended (x) option, that is,
  #            spaces are ignored (use [ ] for a literal space)

  ## penalty shootout score incl. marker (and required trailing space)
  ##   e.g. 3-4 pen. or 3-4p
  SCORE_P_FRAG  = '(?<p1>\d{1,2}) - (?<p2>\d{1,2}) [ ]* ' + P_EN + ' [ ]+'

  ## extra time score incl. marker  e.g. 2-2 a.e.t. or 2-2aet
  SCORE_ET_FRAG = '(?<et1>\d{1,2}) - (?<et2>\d{1,2}) [ ]* ' + ET_EN

  ## full time (and optional half time) score in parentheses
  ##   e.g. (1-1, 1-0) or (1-1, ) or (1-1)
  SCORE_FT_HT_FRAG = [
    '\(',
       '[ ]*',
       '(?<ft1>\d{1,2}) - (?<ft2>\d{1,2})',
       '[ ]*',
       '(?:',
          ', [ ]*',
          '(?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2}) [ ]* )?',
       ')?',        # note: make half time (HT) score optional for now
    '\)',
  ].join( ' ' )

  ## score must be followed by space, closing bracket or end of line
  ##   todo/check: remove lookahead assertion here - why require space?
  ##   note: \b works only after non-alphanum e.g. )
  SCORE_END_FRAG = '(?=[ \]]|$)'


  ## note: allow SPECIAL cases WITHOUT full time scores (just a.e.t or pen. + a.e.t.)
  ##      3-4 pen. 2-2 a.e.t.
  ##               2-2 a.e.t.
  SCORE__P_ET__RE = %r{
      (?<score>
         \b
          (?: #{SCORE_P_FRAG} )?    # note: make penalty (P) score optional for now
          #{SCORE_ET_FRAG}
          #{SCORE_END_FRAG}
      )}ix


  ## e.g. 3-4 pen. 2-2 a.e.t. (1-1, 1-1)  or
  ##      3-4p 2-2aet (1-1, )     or
  ##      3-4 pen.  2-2 a.e.t. (1-1)       or
  ##               2-2 a.e.t. (1-1, 1-1)  or
  ##               2-2 a.e.t. (1-1, )     or
  ##               2-2 a.e.t. (1-1)
  SCORE__P_ET_FT_HT__RE = %r{
      (?<score>
         \b
          (?: #{SCORE_P_FRAG} )?    # note: make penalty (P) score optional for now
          #{SCORE_ET_FRAG} [ ]+
          #{SCORE_FT_HT_FRAG}
          #{SCORE_END_FRAG}
      )}ix


  ###
  ##   special case for case WITHOUT extra time!!
  ##     same as above (but WITHOUT extra time and pen required)
  SCORE__P_FT_HT__RE = %r{
      (?<score>
         \b
          #{SCORE_P_FRAG}
          #{SCORE_FT_HT_FRAG}
          #{SCORE_END_FRAG}
      )}ix


  ## e.g. 2-1 (1-1) or
  ##      2-1
  SCORE__FT_HT__RE = %r{
      (?<score>
         \b
          (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
          (?:
             [ ]+ \( [ ]*
             (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
             [ ]* \)
          )?       # note: make half time (HT) score optional for now
          #{SCORE_END_FRAG}
      )}ix


  #############################################
  # map tables
  #  note: order matters; first come-first matched/served
  SCORE_RE = Regexp.union(
    SCORE__P_ET_FT_HT__RE,    # e.g. 5-1 pen. 2-2 a.e.t. (1-1, 1-0)
    SCORE__P_FT_HT__RE,       # e.g. 5-1 pen. (1-1)
    SCORE__P_ET__RE,          # e.g. 2-2 a.e.t.  or  5-1 pen. 2-2 a.e.t.
    SCORE__FT_HT__RE          # e.g. 1-1 (1-0)
  )

end  # class Parser
end  # module SportDb
|
121
|
+
|
@@ -0,0 +1,114 @@
|
|
1
|
+
module SportDb
class Parser

  ##
  ##  text token - e.g. team names, player names, and such
  ##
  ##  the first word starts with
  ##    opt 1) one or more (unicode) letters, or
  ##    opt 2) a number followed by an optional dot,
  ##             an optional space and one or more letters e.g.
  ##               1 FC
  ##               1. FC
  ##               1.FC
  ##               1FC  or 1st, 2nd, 5th
  ##               1890 Munich
  ##
  ##  more words may follow, separated by a single space or dash (-);
  ##    inline allowed are letters, numbers, dots and & / '
  ##      e.g. Cote'd Ivoir or such
  ##    note - numbers followed by one of 0-9 . : h ' / + -
  ##             are NOT part of the text
  ##    note - a space followed by v, vs, v. or vs. (and a space)
  ##             ends the text
  ##
  ##  optional at the end - a tag and/or years (in this order) e.g.
  ##     (A) or (B) or (U21) or (U9)
  ##     (1879-1893)
  ##
  ##  the text must start at the beginning of the line or after
  ##    one of the chars (space) , ; @ | [ ]  and
  ##    must be followed by one of these chars or the end of the line
  ##
  ##  todo/check: only end in alphanum a-z0-9 (not dot or & ???)
  ##  todo/check: add (dedicated) single alpha text - why? why not?
  ##                e.g.  A - B    C - D

  TEXT_RE = %r{
      (?<text>
         ## positive lookbehind
         ##  (MUST be fixed number of chars - no quantifier e.g. +? etc.)
         (?<=[ ,;@|\[\]]
              |^
         )

         (?:
            # opt 1 - start with alpha
            \p{L}+      ## all unicode letters (e.g. [a-z])
               |

            # opt 2 - start with num!! - allow special case (e.g. 1. FC)
            \d+
               \.?      ## optional dot
               [ ]?     ## make space optional too
                        ##   yes - e.g. 1st, 2nd, 5th etc.
               \p{L}+
         )

         (?:(?:  (?:[ ]
                     (?!vs?\.?[ ])    ## note - exclude (v[ ]/vs[ ]/v.[ ]/vs.[ ])
                  )
                  |     # only single spaces allowed inline!!!
                 [-]
             )?
            (?:
               \p{L} |
               [&/']
                 |
               (?:
                  \d+
                  (?![0-9.:h'/+-])
                  ## negative lookahead for numbers
                  ##   note - include digits itself!!!
               )|
               \.
            )
         )*    ## must NOT end with space or dash(-)

         ## optional tag  e.g. (A), (B), (U21)
         (?:
            [ ]
            \( (?:
                 A|B|
                 U\d{1,2}
               )
            \)
         )?
         ## optional years  e.g. (1879-1893)
         (?:
            [ ]
            \(
              \d{4}-\d{4}
            \)
         )?

         ## positive lookahead - kind of \b !!!
         ##   must be space (or comma and such or end of line)
         (?=[ ,;@|\[\]]
              |$
         )
      )
  }ix

end  # class Parser
end  # module SportDb
|
114
|
+
|