sportdb-parser 0.7.1 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Manifest.txt +17 -4
- data/lib/sportdb/parser/lexer-on_goal.rb +172 -0
- data/lib/sportdb/parser/lexer-on_group_def.rb +31 -0
- data/lib/sportdb/parser/lexer-on_prop_lineup.rb +79 -0
- data/lib/sportdb/parser/lexer-on_prop_misc.rb +110 -0
- data/lib/sportdb/parser/lexer-on_prop_penalties.rb +40 -0
- data/lib/sportdb/parser/lexer-on_round_def.rb +37 -0
- data/lib/sportdb/parser/lexer-on_top.rb +125 -0
- data/lib/sportdb/parser/lexer-prep_doc.rb +131 -0
- data/lib/sportdb/parser/lexer-prep_line.rb +63 -0
- data/lib/sportdb/parser/lexer-tokenize.rb +449 -0
- data/lib/sportdb/parser/lexer.rb +133 -1363
- data/lib/sportdb/parser/lexer_buffer.rb +8 -37
- data/lib/sportdb/parser/lexer_token.rb +126 -0
- data/lib/sportdb/parser/parser.rb +1104 -1403
- data/lib/sportdb/parser/racc_parser.rb +36 -32
- data/lib/sportdb/parser/racc_tree.rb +65 -98
- data/lib/sportdb/parser/token-date--helpers.rb +130 -0
- data/lib/sportdb/parser/token-date--names.rb +108 -0
- data/lib/sportdb/parser/token-date.rb +20 -192
- data/lib/sportdb/parser/token-date_duration.rb +8 -27
- data/lib/sportdb/parser/token-geo.rb +16 -16
- data/lib/sportdb/parser/token-goals--helpers.rb +114 -0
- data/lib/sportdb/parser/token-goals.rb +103 -249
- data/lib/sportdb/parser/token-group.rb +8 -22
- data/lib/sportdb/parser/token-prop.rb +138 -124
- data/lib/sportdb/parser/token-prop_name.rb +48 -39
- data/lib/sportdb/parser/token-round.rb +21 -35
- data/lib/sportdb/parser/token-score--helpers.rb +189 -0
- data/lib/sportdb/parser/token-score.rb +9 -393
- data/lib/sportdb/parser/token-score_full.rb +331 -0
- data/lib/sportdb/parser/token-status.rb +44 -46
- data/lib/sportdb/parser/token-status_inline.rb +112 -0
- data/lib/sportdb/parser/token-text.rb +41 -31
- data/lib/sportdb/parser/token-time.rb +29 -26
- data/lib/sportdb/parser/token.rb +58 -159
- data/lib/sportdb/parser/version.rb +1 -1
- data/lib/sportdb/parser.rb +45 -17
- metadata +19 -6
- data/lib/sportdb/parser/blocktxt.rb +0 -99
- data/lib/sportdb/parser/lexer_tty.rb +0 -111
- data/lib/sportdb/parser/token-table.rb +0 -149
- data/lib/sportdb/parser/token_helpers.rb +0 -92
|
@@ -2,12 +2,12 @@
|
|
|
2
2
|
## team prop mode e.g.
|
|
3
3
|
##
|
|
4
4
|
##
|
|
5
|
-
## Fri Jun 14 21:00 @ München Fußball Arena, München
|
|
6
|
-
## Germany v Scotland 5-1 (3-0)
|
|
5
|
+
## Fri Jun 14 21:00 @ München Fußball Arena, München
|
|
6
|
+
## Germany v Scotland 5-1 (3-0)
|
|
7
7
|
## (Wirtz 10' Musiala 19' Havertz 45+1' (pen.) Füllkrug 68' Can 90+3'; Rüdiger 87' (o.g.))
|
|
8
|
-
##
|
|
8
|
+
##
|
|
9
9
|
## Germany: Neuer - Kimmich, Rüdiger, Tah [Y], Mittelstädt - Andrich [Y] (Groß 46'),
|
|
10
|
-
## Kroos (Can 80') - Musiala (Müller 74'), Gündogan, Wirtz (Sane 63') -
|
|
10
|
+
## Kroos (Can 80') - Musiala (Müller 74'), Gündogan, Wirtz (Sane 63') -
|
|
11
11
|
## Havertz (Füllkrug 63')
|
|
12
12
|
## Scotland: Gunn - Porteous [R 44'], Hendry, Tierney (McKenna 78') - Ralston [Y],
|
|
13
13
|
## McTominay, McGregor (Gilmour 67'), Robertson - Christie (Shankland 82'),
|
|
@@ -20,7 +20,7 @@ class Lexer
|
|
|
20
20
|
|
|
21
21
|
##############
|
|
22
22
|
# add support for props/ attributes e.g.
|
|
23
|
-
#
|
|
23
|
+
#
|
|
24
24
|
# Germany: Neuer - Kimmich, Rüdiger, Tah [Y], Mittelstädt - Andrich [Y] (46' Groß),
|
|
25
25
|
# Kroos (80' Can) - Musiala (74' Müller), Gündogan,
|
|
26
26
|
# Wirtz (63' Sane) - Havertz (63' Füllkrug)
|
|
@@ -35,54 +35,123 @@ class Lexer
|
|
|
35
35
|
## limit to 30 chars max
|
|
36
36
|
## only allow chars incl. intl but (NOT ()[]/;)
|
|
37
37
|
##
|
|
38
|
-
## todo/fix:
|
|
39
|
-
## check if St. Pölten works; with starting St. ???
|
|
40
38
|
##
|
|
41
39
|
## note - use special \G - Matches first matching position !!!!
|
|
40
|
+
## check for \G like backreference of regex tokens/parts if possible/available in ruby?
|
|
42
41
|
|
|
43
|
-
###
|
|
44
|
-
## todo/fix/fix
|
|
45
|
-
## change ^ to \A
|
|
46
|
-
## change name to START_WITH_PROP_KEY_RE !!!
|
|
47
42
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
43
|
+
|
|
44
|
+
## (i) starting w/ letters
|
|
45
|
+
## note - incl./allows digits (0-9)
|
|
46
|
+
## e.g. a1, a2000, etc.
|
|
47
|
+
##
|
|
48
|
+
## note - added back optional trailing dot (.) for abbrev. word !!!
|
|
49
|
+
PROP_KEY_WORD_ = %r{
|
|
50
|
+
\p{L}
|
|
51
|
+
[\p{L}\d]*
|
|
52
|
+
\.?
|
|
53
|
+
}ix
|
|
54
|
+
|
|
55
|
+
## note - incl. optional dot or numsign e.g. 1. or 1°
|
|
56
|
+
PROP_KEY_NUM_ = %r{
|
|
57
|
+
\d+
|
|
58
|
+
[.°]?
|
|
59
|
+
}ix
|
|
60
|
+
|
|
61
|
+
## e.g. 1A, 1FC etc.
|
|
62
|
+
## note - no trailing dot (.) for now - check if any cases exist in real world
|
|
63
|
+
PROP_KEY_NUMALPHA_ = %r{
|
|
64
|
+
\d+
|
|
65
|
+
\p{L}
|
|
66
|
+
[\p{L}\d]*
|
|
67
|
+
}ix
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
START_WITH_PROP_KEY_RE = %r{
|
|
74
|
+
\A ## note - MUST start line; leading spaces optional (eat-up)
|
|
51
75
|
(?<prop_key>
|
|
76
|
+
[ ]* ## optional leading spaces
|
|
52
77
|
(?<key>
|
|
53
|
-
(
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
78
|
+
(?:
|
|
79
|
+
## (i) starting w/ letters
|
|
80
|
+
#{PROP_KEY_WORD_}
|
|
81
|
+
|
|
82
|
+
## (ii) starting w/ number
|
|
83
|
+
## e.g. 1fc, 1a,
|
|
84
|
+
| #{PROP_KEY_NUMALPHA_}
|
|
85
|
+
## followed by optional dot (.) and
|
|
86
|
+
## optional space
|
|
87
|
+
## MUST be followed by letter (a to z)!!!!
|
|
88
|
+
## eg. 1[ fc], 1.[ fc], 1.[fc], etc.
|
|
89
|
+
| #{PROP_KEY_NUM_} (?= [ ]? \p{L})
|
|
90
|
+
)
|
|
91
|
+
(?:
|
|
92
|
+
## connectors - note - no dot (.), must match with abbrev word or num!!
|
|
93
|
+
(?: ## (i) single space or WITHOUT surrounding spaces!! - slash (/), dash (-)
|
|
94
|
+
## e.g. do NOT match one - two or one / two
|
|
95
|
+
## only one-two or one/two
|
|
96
|
+
|
|
97
|
+
[ /-]
|
|
98
|
+
|
|
99
|
+
## (ii) surrounded by leading or trailing optional space
|
|
100
|
+
## c & a, etc.
|
|
101
|
+
## d'ivoire, d' ivoire
|
|
102
|
+
## borussia 'gladbach etc.
|
|
103
|
+
## exclude space ' space - why? why not? (or ignore for now)
|
|
104
|
+
##
|
|
105
|
+
## check for quotes ('') - not really supported here
|
|
106
|
+
## e.g. leading or trailing ' will NOT match
|
|
107
|
+
|
|
108
|
+
| [ ]? & [ ]?
|
|
109
|
+
| [ ]? '
|
|
110
|
+
| ' [ ]?
|
|
111
|
+
|
|
112
|
+
#### (iii)
|
|
113
|
+
## note - special "hack" to connect WITHOUT space
|
|
114
|
+
## for Union 1.FC and SKN St.Pölten or St.Pölten
|
|
115
|
+
## connects 1.FC => NUM+WORD
|
|
116
|
+
## 1°Mayo => NUM+WORD
|
|
117
|
+
## St.Pölten => ABBREV+WORD
|
|
118
|
+
##
|
|
119
|
+
## note - match WITHOUT (space) connector
|
|
120
|
+
## 1.FC (Union 1.FC Stein)
|
|
121
|
+
## [WORD: "Union"], [NUM: "1."], [WORD: "FC"]
|
|
122
|
+
## St.Pölten (SKN St.Pölten)
|
|
123
|
+
## [WORD: "SKN"], [ABBREV: "St."], [WORD: "Pölten"]
|
|
124
|
+
| (?<= [.°] )
|
|
125
|
+
(?= \p{L})
|
|
126
|
+
)
|
|
127
|
+
(?:
|
|
128
|
+
#{PROP_KEY_NUMALPHA_}
|
|
129
|
+
| #{PROP_KEY_NUM_}
|
|
130
|
+
| #{PROP_KEY_WORD_}
|
|
131
|
+
)
|
|
132
|
+
)*
|
|
133
|
+
) ## close <key> capture
|
|
134
|
+
[ ]*? ## slurp trailing spaces
|
|
71
135
|
:
|
|
72
|
-
|
|
73
|
-
|
|
136
|
+
|
|
137
|
+
## positive lookahead (must be followed by space!!)
|
|
138
|
+
## or allow end-of-line too
|
|
139
|
+
(?= [ ]+|$)
|
|
140
|
+
) ## close <prop_key> capture
|
|
74
141
|
}ix
|
|
75
142
|
|
|
76
143
|
|
|
77
144
|
|
|
145
|
+
|
|
146
|
+
|
|
78
147
|
################
|
|
79
148
|
## todo/check - use token for card short cuts?
|
|
80
149
|
## if m[:name] == 'Y'
|
|
81
150
|
## [:YELLOW_CARD, m[:name]]
|
|
82
151
|
## elsif m[:name] == 'R'
|
|
83
152
|
## [:RED_CARD, m[:name]]
|
|
84
|
-
## - [Y], [R], [Y/R] Yellow-Red Card
|
|
85
|
-
## check if minutes possible inside [Y 46']
|
|
153
|
+
## - [Y], [R], [Y/R] Yellow-Red Card
|
|
154
|
+
## check if minutes possible inside [Y 46']
|
|
86
155
|
## add [c] for captain too
|
|
87
156
|
|
|
88
157
|
|
|
@@ -102,13 +171,13 @@ class Lexer
|
|
|
102
171
|
\+
|
|
103
172
|
(?<offset>\d{1,2})
|
|
104
173
|
'?
|
|
105
|
-
)?
|
|
106
|
-
)?
|
|
174
|
+
)?
|
|
175
|
+
)?
|
|
107
176
|
\]
|
|
108
177
|
)}x
|
|
109
178
|
|
|
110
179
|
INLINE_RED = %r{ (?<inline_red>
|
|
111
|
-
\[ [rR]
|
|
180
|
+
\[ [rR]
|
|
112
181
|
## optional minute
|
|
113
182
|
(?: [ ]+
|
|
114
183
|
(?<minute> \d{1,3})
|
|
@@ -117,14 +186,14 @@ class Lexer
|
|
|
117
186
|
\+
|
|
118
187
|
(?<offset>\d{1,2})
|
|
119
188
|
'?
|
|
120
|
-
)?
|
|
121
|
-
)?
|
|
189
|
+
)?
|
|
190
|
+
)?
|
|
122
191
|
\]
|
|
123
192
|
)}x
|
|
124
193
|
|
|
125
194
|
INLINE_YELLOW_RED = %r{ (?<inline_yellow_red>
|
|
126
195
|
\[ (?:y/r |
|
|
127
|
-
Y/R )
|
|
196
|
+
Y/R )
|
|
128
197
|
## optional minute
|
|
129
198
|
(?: [ ]+
|
|
130
199
|
(?<minute> \d{1,3})
|
|
@@ -133,8 +202,8 @@ class Lexer
|
|
|
133
202
|
\+
|
|
134
203
|
(?<offset>\d{1,2})
|
|
135
204
|
'?
|
|
136
|
-
)?
|
|
137
|
-
)?
|
|
205
|
+
)?
|
|
206
|
+
)?
|
|
138
207
|
\]
|
|
139
208
|
)}x
|
|
140
209
|
|
|
@@ -144,112 +213,57 @@ class Lexer
|
|
|
144
213
|
### simple prop key for inline use e.g.
|
|
145
214
|
### Coach: or Trainer: or ... add more here later
|
|
146
215
|
|
|
147
|
-
PROP_KEY_INLINE_RE = %r{
|
|
148
|
-
\b
|
|
216
|
+
PROP_KEY_INLINE_RE = %r{
|
|
217
|
+
\b
|
|
149
218
|
(?<prop_key> ## note: use prop_key (NOT prop_key_inline or such)
|
|
150
219
|
(?<key>
|
|
151
220
|
\p{L}+
|
|
152
221
|
)
|
|
153
|
-
## note - NO spaces allowed for key for now!!!
|
|
222
|
+
## note - NO spaces allowed for key for now!!!
|
|
154
223
|
:
|
|
155
|
-
|
|
224
|
+
## positive lookahead (must be followed by space!!)
|
|
225
|
+
(?=[ ]+)
|
|
156
226
|
)
|
|
157
227
|
}ix
|
|
158
228
|
|
|
159
229
|
|
|
230
|
+
|
|
231
|
+
## note allow underscore inline e.g.
|
|
232
|
+
## 5_000
|
|
233
|
+
## discuss/check - allow space inline (e.g. 5 000) - why? why not?
|
|
234
|
+
|
|
160
235
|
PROP_NUM_RE = %r{
|
|
161
236
|
\b
|
|
162
237
|
(?<num>
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
## allow space inline (e.g. 5 000) - why? why not?
|
|
166
|
-
(?<value> [1-9]
|
|
167
|
-
(?: _?
|
|
168
|
-
[0-9]+
|
|
169
|
-
)*
|
|
238
|
+
(?<value> [0-9]+
|
|
239
|
+
(?: _ [0-9]+)*
|
|
170
240
|
)
|
|
171
241
|
)
|
|
172
242
|
\b
|
|
173
|
-
}
|
|
243
|
+
}x
|
|
244
|
+
|
|
174
245
|
|
|
175
246
|
### todo/fix - allow more chars in enclosed name - why? why not?
|
|
176
247
|
## e.g. (') - Côte d'Ivoire etc.
|
|
177
248
|
## change to PAREN_NAME or PARENTHESIS or such - why? why not?
|
|
178
|
-
ENCLOSED_NAME_RE = %r{
|
|
179
|
-
(?<enclosed_name>
|
|
180
|
-
\(
|
|
181
|
-
(?<name>
|
|
249
|
+
ENCLOSED_NAME_RE = %r{
|
|
250
|
+
(?<enclosed_name>
|
|
251
|
+
\(
|
|
252
|
+
(?<name>
|
|
182
253
|
\p{L}+
|
|
183
254
|
(?:
|
|
184
|
-
[ ]
|
|
185
|
-
\p{L}+
|
|
255
|
+
[ ]
|
|
256
|
+
\p{L}+
|
|
186
257
|
)*
|
|
187
258
|
)
|
|
188
259
|
\)
|
|
189
260
|
)
|
|
190
261
|
}ix
|
|
191
262
|
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
|
198
|
-
(?<sym>
|
|
199
|
-
[;,\(\)\[\]-]
|
|
200
|
-
)
|
|
201
|
-
}ix
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
PROP_RE = Regexp.union(
|
|
206
|
-
MINUTE_RE, ## e.g. 44 or 44' or 45+1 or 45+1' etc.
|
|
207
|
-
|
|
208
|
-
INLINE_CAPTAIN, ## e.g. [c]
|
|
209
|
-
INLINE_YELLOW, ## e.g. [Y] or [Y 44] or [Y 44'] or [Y 45+1']
|
|
210
|
-
INLINE_YELLOW_RED, ## e.g. [Y/R] or [Y/R 78]
|
|
211
|
-
INLINE_RED, ## e.g. [R] or [R 42] or [R 42']
|
|
212
|
-
|
|
213
|
-
PROP_KEY_INLINE_RE,
|
|
214
|
-
PROP_NAME_RE,
|
|
215
|
-
PROP_BASICS_RE,
|
|
216
|
-
## todo/fix - add ANY_RE here too!!!
|
|
217
|
-
)
|
|
218
|
-
|
|
219
|
-
## note - no inline keys possible
|
|
220
|
-
## todo/fix - use custom (limited) prop basics too
|
|
221
|
-
PROP_CARDS_RE = Regexp.union(
|
|
222
|
-
MINUTE_RE,
|
|
223
|
-
PROP_NAME_RE,
|
|
224
|
-
PROP_BASICS_RE,
|
|
225
|
-
## todo/fix - add ANY_RE here too!!!
|
|
226
|
-
)
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
PROP_PENALTIES_RE = Regexp.union(
|
|
230
|
-
SCORE_RE, # e.g. 1-1 etc.
|
|
231
|
-
ENCLOSED_NAME_RE, # e.g. (save), (post), etc.
|
|
232
|
-
PROP_NAME_RE,
|
|
233
|
-
PROP_BASICS_RE,
|
|
234
|
-
## todo/fix - add ANY_RE here too!!!
|
|
235
|
-
)
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
PROP_REFEREE_RE = Regexp.union(
|
|
239
|
-
ENCLOSED_NAME_RE, # e.g. (sold out) etc. why? why not?
|
|
240
|
-
PROP_NUM_RE, # e.g. 28 000 or 28_000 (NOT 28,000 is not valid!!!)
|
|
241
|
-
PROP_KEY_INLINE_RE,
|
|
242
|
-
PROP_NAME_RE,
|
|
243
|
-
PROP_BASICS_RE,
|
|
244
|
-
## todo/fix - add ANY_RE here too!!!
|
|
245
|
-
)
|
|
246
|
-
|
|
247
|
-
PROP_ATTENDANCE_RE = Regexp.union(
|
|
248
|
-
ENCLOSED_NAME_RE, # e.g. (sold out) etc. why? why not?
|
|
249
|
-
PROP_NUM_RE, # e.g. 28 000 or 28_000 (NOT 28,000 is not valid!!!)
|
|
250
|
-
PROP_BASICS_RE,
|
|
251
|
-
## todo/fix - add ANY_RE here too!!!
|
|
252
|
-
)
|
|
253
|
-
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
|
|
254
268
|
end # class Lexer
|
|
255
269
|
end # module SportDb
|
|
@@ -5,69 +5,78 @@ class Lexer
|
|
|
5
5
|
##
|
|
6
6
|
## see token-text for TEXT_RE
|
|
7
7
|
## change PROP_NAME_RE to TEXT_II or TEXT_??? - why? why not?
|
|
8
|
+
### no, do NOT change
|
|
9
|
+
## change TEXT_RE to TEAM_RE or TEAM_NAME_RE !!!!
|
|
10
|
+
## it is NOT generic TEXT regex!!!
|
|
8
11
|
|
|
9
12
|
|
|
10
13
|
|
|
11
|
-
##
|
|
12
|
-
##
|
|
13
|
-
## FIX / FIX / FIX
|
|
14
|
-
## support match for
|
|
15
|
-
## K.-H.Förster
|
|
16
14
|
|
|
15
|
+
PROP_NAME_WORD_ = %r{
|
|
16
|
+
\p{L}+
|
|
17
|
+
\.? ## optional dot
|
|
18
|
+
}ix
|
|
17
19
|
|
|
18
20
|
|
|
21
|
+
## todo/fix - remove support for double quotes e.g. "Rodri" - why? why not?
|
|
22
|
+
##
|
|
19
23
|
|
|
20
|
-
## name different from text (does NOT allow number in name/text)
|
|
24
|
+
## name different from text (**does NOT allow number in name/text**)
|
|
25
|
+
## different from PROP_KEY too
|
|
21
26
|
PROP_NAME_RE = %r{
|
|
22
|
-
(?<prop_name>
|
|
27
|
+
(?<prop_name>
|
|
23
28
|
\b
|
|
24
29
|
(?<name>
|
|
25
|
-
|
|
26
|
-
|
|
30
|
+
#{PROP_NAME_WORD_}
|
|
31
|
+
|
|
32
|
+
## connectors
|
|
27
33
|
(?:
|
|
28
|
-
##
|
|
34
|
+
## (i) space - only one single space allowed inline!!!
|
|
29
35
|
(?:
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
36
|
+
### check if negative lookbehind is redundant!!
|
|
37
|
+
## next char is \p{L} and NOT space
|
|
38
|
+
## thus double space not possible!!
|
|
39
|
+
(?<! [ ]) ## use negative lookbehind
|
|
40
|
+
[ ]
|
|
41
|
+
(?= \p{L}|['"]\p{L}) ## use lookahead
|
|
33
42
|
)
|
|
34
|
-
## support (inline) quoted name e.g. "Rodri" or such
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
" \p{L}+ "
|
|
43
|
+
## (ii) support (inline) quoted name e.g. "Rodri" or such
|
|
44
|
+
| (?:
|
|
45
|
+
(?<=[ ]) ## use positive lookbehind
|
|
46
|
+
" \p{L}+ "
|
|
39
47
|
## require space here too - why? why not?
|
|
40
|
-
)
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
48
|
+
)
|
|
49
|
+
## (iii) dash (-)
|
|
50
|
+
| (?:
|
|
51
|
+
## use POSITIVE lookBEHIND
|
|
52
|
+
## note - allow leading dot (.) e.g. K.-H.Förster
|
|
53
|
+
## short for Karl-Heinz Förster
|
|
54
|
+
##
|
|
55
|
+
## change to negative lookBEHIND [ '"-]
|
|
56
|
+
## \p{L}\. | \p{L} - not MUST be fixed size
|
|
57
|
+
(?<=
|
|
58
|
+
[\p{L}.]
|
|
59
|
+
)
|
|
46
60
|
[-] ## must be surrounded by letters
|
|
47
|
-
## note - allow leading dot (.) e.g. K.-H.Förster
|
|
48
|
-
## short for Karl-Heinz Förster
|
|
49
|
-
##
|
|
50
61
|
## e.g. One-Two NOT
|
|
51
62
|
## One- Two or One - Two or One -Two etc.
|
|
52
|
-
(
|
|
63
|
+
(?= \p{L}) ## use lookahead
|
|
53
64
|
)
|
|
54
|
-
|
|
|
65
|
+
|
|
|
55
66
|
(?: ## flex rule for quote - allow any
|
|
56
67
|
## only check for double quotes e.g. cannot follow other ' for now - why? why not?
|
|
57
68
|
## allows rodrigez 'rodri' for example
|
|
58
|
-
(?<!') ## use negative lookbehind
|
|
59
|
-
'
|
|
60
|
-
)
|
|
61
|
-
| ## standard case with letter(s) and optinal dot
|
|
62
|
-
(?: \p{L}+
|
|
63
|
-
\.? ## optional dot
|
|
69
|
+
(?<!') ## use negative lookbehind
|
|
70
|
+
'
|
|
64
71
|
)
|
|
72
|
+
| ## standard case with letter(s) and optional dot
|
|
73
|
+
#{PROP_NAME_WORD_}
|
|
65
74
|
)*
|
|
66
75
|
)
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
)
|
|
70
|
-
}ix
|
|
76
|
+
## add lookahead - must be non-alphanum
|
|
77
|
+
## add colon (:) too - why? why not?
|
|
78
|
+
(?= [ ,;\]\)]|$)
|
|
79
|
+
)}ix
|
|
71
80
|
|
|
72
81
|
|
|
73
82
|
end # class Lexer
|
|
@@ -2,7 +2,7 @@ module SportDb
|
|
|
2
2
|
class Lexer
|
|
3
3
|
|
|
4
4
|
####
|
|
5
|
-
#
|
|
5
|
+
#
|
|
6
6
|
## note - use \A (instead of ^) - \A strictly matches the start of the string.
|
|
7
7
|
##
|
|
8
8
|
## todo - add support for trailing markers e.g.
|
|
@@ -17,45 +17,45 @@ ROUND_OUTLINE_I_RE = %r{ \A
|
|
|
17
17
|
[ ]* ## ignore leading spaces (if any)
|
|
18
18
|
(?<round_marker>
|
|
19
19
|
[▪]{1,3} ## BLACK SMALL SQUARE e.g. ▪,▪▪,▪▪▪
|
|
20
|
-
)
|
|
20
|
+
)
|
|
21
21
|
[ ]+
|
|
22
22
|
(?<round_outline>
|
|
23
23
|
## must start with letter - why? why not?
|
|
24
24
|
### 1st round
|
|
25
|
-
## allow numbers e.g. Group A - 1
|
|
26
|
-
##
|
|
25
|
+
## allow numbers e.g. Group A - 1
|
|
26
|
+
##
|
|
27
27
|
## note - CANNOT incl. :| !!!
|
|
28
28
|
## used for markers for defs/definitions
|
|
29
|
-
[^:|]+? ## use non-greedy
|
|
29
|
+
[^:|]+? ## use non-greedy
|
|
30
30
|
)
|
|
31
31
|
(?:
|
|
32
|
-
[ ]+
|
|
32
|
+
[ ]+
|
|
33
33
|
[▪]+
|
|
34
34
|
)?
|
|
35
|
-
[ ]* ## ignore trailing spaces (if any)
|
|
35
|
+
[ ]* ## ignore trailing spaces (if any)
|
|
36
36
|
\z
|
|
37
37
|
}xi
|
|
38
38
|
|
|
39
39
|
ROUND_OUTLINE_II_RE = %r{ \A
|
|
40
40
|
[ ]* ## ignore leading spaces (if any)
|
|
41
41
|
(?<round_marker>
|
|
42
|
-
::{1,3} ## e.g. ::,:::,::::
|
|
43
|
-
)
|
|
42
|
+
::{1,3} ## e.g. ::,:::,::::
|
|
43
|
+
)
|
|
44
44
|
[ ]+
|
|
45
45
|
(?<round_outline>
|
|
46
46
|
## must start with letter - why? why not?
|
|
47
47
|
### 1st round
|
|
48
|
-
## allow numbers e.g. Group A - 1
|
|
49
|
-
##
|
|
48
|
+
## allow numbers e.g. Group A - 1
|
|
49
|
+
##
|
|
50
50
|
## note - CANNOT incl. :| !!!
|
|
51
51
|
## used for markers for defs/definitions
|
|
52
|
-
[^:|]+? ## use non-greedy
|
|
52
|
+
[^:|]+? ## use non-greedy
|
|
53
53
|
)
|
|
54
54
|
(?:
|
|
55
|
-
[ ]+
|
|
55
|
+
[ ]+
|
|
56
56
|
::+
|
|
57
57
|
)?
|
|
58
|
-
[ ]* ## ignore trailing spaces (if any)
|
|
58
|
+
[ ]* ## ignore trailing spaces (if any)
|
|
59
59
|
\z
|
|
60
60
|
}xi
|
|
61
61
|
|
|
@@ -65,38 +65,24 @@ ROUND_OUTLINE_RE = Regexp.union( ROUND_OUTLINE_I_RE,
|
|
|
65
65
|
|
|
66
66
|
|
|
67
67
|
###
|
|
68
|
-
# note - for def(initions) only one level support
|
|
68
|
+
# note - for def(initions) only one level support
|
|
69
69
|
# that is, no round outline additions possible (e.g ▪▪ 1st leg etc.)
|
|
70
70
|
ROUND_DEF_OUTLINE_RE = %r{ \A
|
|
71
71
|
[ ]* ## ignore leading spaces (if any)
|
|
72
72
|
(?: [▪] ## BLACK SMALL SQUARE
|
|
73
73
|
|
|
|
74
|
-
:: )
|
|
74
|
+
:: )
|
|
75
75
|
[ ]+
|
|
76
76
|
(?<round_outline>
|
|
77
|
-
[^:|]+? ## use non-greedy
|
|
77
|
+
[^:|]+? ## use non-greedy
|
|
78
78
|
)
|
|
79
|
-
[ ]* ## ignore trailing spaces (if any)
|
|
80
|
-
### possitive lookahead MUST be : OR |
|
|
81
|
-
(?= [:|]
|
|
82
|
-
[ ]) ## note: requires space for now after [:|] - keep - why? why not?
|
|
79
|
+
[ ]* ## ignore trailing spaces (if any)
|
|
80
|
+
### positive lookahead MUST be : OR |
|
|
81
|
+
(?= [:|]
|
|
82
|
+
[ ]) ## note: requires space for now after [:|] - keep - why? why not?
|
|
83
83
|
}ix
|
|
84
84
|
|
|
85
85
|
|
|
86
|
-
ROUND_DEF_BASICS_RE = %r{
|
|
87
|
-
(?<spaces> [ ]{2,}) |
|
|
88
|
-
(?<space> [ ])
|
|
89
|
-
|
|
|
90
|
-
(?<sym> [:|,] ) ### note - add comma (,) as optional separator
|
|
91
|
-
}ix
|
|
92
|
-
|
|
93
|
-
ROUND_DEF_RE = Regexp.union( ROUND_DEF_BASICS_RE,
|
|
94
|
-
DURATION_RE, # note - duration MUST match before date
|
|
95
|
-
DATE_RE, ## note - date must go before time (e.g. 12.12. vs 12.12)
|
|
96
|
-
ANY_RE,
|
|
97
|
-
)
|
|
98
|
-
|
|
99
|
-
|
|
100
86
|
|
|
101
87
|
end # class Lexer
|
|
102
88
|
end # module SportDb
|