sportdb-parser 0.6.17 → 0.6.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Manifest.txt +1 -0
- data/lib/sportdb/parser/token-geo.rb +134 -0
- data/lib/sportdb/parser/token-text.rb +16 -13
- data/lib/sportdb/parser/token.rb +0 -64
- data/lib/sportdb/parser/version.rb +1 -1
- data/lib/sportdb/parser.rb +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 45e7965659ecd817aa2f54d04f81c673c6b7be3c1ea3294bf978b25462786726
|
4
|
+
data.tar.gz: c897aac96c5229c589af5966e68497d1dcd28b748241a15d0f386a2b65265b6a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f918e333fba1dae8abcc3f143db0ef39b9c64e4fcaf89469482173c5b2944ba9a4e58cb9e71642c6fa5f14069609a40df23a62f6a0cc697bd22b465e73a17c97
|
7
|
+
data.tar.gz: 0c39baca8b5bd3d05720c1ade1c492361b186268ab549819e2428d41d2c2bab34424deefe328ca63504cf1bc802694edbea58e438e6225d7f2c69e325b662213
|
data/CHANGELOG.md
CHANGED
data/Manifest.txt
CHANGED
@@ -15,6 +15,7 @@ lib/sportdb/parser/parser.rb
|
|
15
15
|
lib/sportdb/parser/racc_parser.rb
|
16
16
|
lib/sportdb/parser/racc_tree.rb
|
17
17
|
lib/sportdb/parser/token-date.rb
|
18
|
+
lib/sportdb/parser/token-geo.rb
|
18
19
|
lib/sportdb/parser/token-minute.rb
|
19
20
|
lib/sportdb/parser/token-prop.rb
|
20
21
|
lib/sportdb/parser/token-score.rb
|
@@ -0,0 +1,134 @@
|
|
1
|
+
module SportDb
|
2
|
+
class Lexer
|
3
|
+
|
4
|
+
|
5
|
+
##
|
6
|
+
# allow Côte d'Ivoire or such
|
7
|
+
## e.g. add '
|
8
|
+
|
9
|
+
|
10
|
+
## todo/fix - make geo text regex more generic
|
11
|
+
## only care about two space rule
|
12
|
+
|
13
|
+
|
14
|
+
GEO_TEXT_RE = %r{
|
15
|
+
## must start with alpha (allow unicode letters!!)
|
16
|
+
(?<text>
|
17
|
+
## positive lookbehind - for now space (or beginning of line - for testing) only
|
18
|
+
## (MUST be fixed number of chars - no quantifier e.g. +? etc.)
|
19
|
+
(?<= [ ,›>\[\]]|^)
|
20
|
+
(?:
|
21
|
+
# opt 1 - start with alpha
|
22
|
+
\p{L}+ ## all unicode letters (e.g. [a-z])
|
23
|
+
|
|
24
|
+
# opt 2 - start with num!! -
|
25
|
+
\d+ # check for num lookahead (MUST be space or dot)
|
26
|
+
## MAY be followed by (optional space) !
|
27
|
+
## MUST be followed by a to z!!!!
|
28
|
+
[ ]? ## make space optional too - why? why not?
|
29
|
+
## yes - eg. 1st, 2nd, 5th etc.
|
30
|
+
\p{L}+
|
31
|
+
|
|
32
|
+
## opt 3 - add another weirdo case
|
33
|
+
## e.g. 's Gravenwezel-Schilde
|
34
|
+
## add more letters (or sequences here - why? why not?)
|
35
|
+
'\p{L}+
|
36
|
+
)
|
37
|
+
|
38
|
+
##
|
39
|
+
## todo/check - find a different "more intuitive" regex/rule if possible?
|
40
|
+
## for single spaces only (and _/ MUST NOT be surrounded by spaces)
|
41
|
+
|
42
|
+
(?:
|
43
|
+
[ ]? # only single spaces allowed inline!!!
|
44
|
+
(?:
|
45
|
+
\p{L} | \d | [.&'°]
|
46
|
+
|
|
47
|
+
(?: (?<! [ ]) ## no space allowed before (but possible after)
|
48
|
+
[-]
|
49
|
+
)
|
50
|
+
|
|
51
|
+
(?: (?<! [ ]) ## no spaces allowed around these characters
|
52
|
+
[_/]
|
53
|
+
(?! [ ])
|
54
|
+
)
|
55
|
+
)+
|
56
|
+
)*
|
57
|
+
|
58
|
+
## must NOT end with space or dash(-)
|
59
|
+
## todo/fix - possible in regex here
|
60
|
+
## only end in alphanum a-z0-9 (not dot or & ???)
|
61
|
+
|
62
|
+
## add lookahead/lookbehind
|
63
|
+
## must be space!!!
|
64
|
+
## (or comma or start/end of string)
|
65
|
+
## kind of \b !!!
|
66
|
+
## positive lookahead
|
67
|
+
(?=[ ,›>\[\]]|$)
|
68
|
+
)
|
69
|
+
}ix
|
70
|
+
|
71
|
+
|
72
|
+
|
73
|
+
##
|
74
|
+
# for timezone format use for now:
|
75
|
+
# (BRT/UTC-3) (e.g. brazil time)
|
76
|
+
#
|
77
|
+
# (CET/UTC+1) - central european time
|
78
|
+
# (CEST/UTC+2) - central european summer time - daylight saving time (DST).
|
79
|
+
# (EET/UTC+2) - eastern european time
|
80
|
+
# (EEST/UTC+3) - eastern european summer time - daylight saving time (DST).
|
81
|
+
#
|
82
|
+
# UTC+3
|
83
|
+
# UTC+4
|
84
|
+
# UTC+0
|
85
|
+
# UTC+00
|
86
|
+
# UTC+0000
|
87
|
+
#
|
88
|
+
# - allow +01 or +0100 - why? why not
|
89
|
+
# - +0130 (01:30)
|
90
|
+
#
|
91
|
+
# see
|
92
|
+
# https://en.wikipedia.org/wiki/Time_zone
|
93
|
+
# https://en.wikipedia.org/wiki/List_of_UTC_offsets
|
94
|
+
# https://en.wikipedia.org/wiki/UTC−04:00 etc.
|
95
|
+
#
|
96
|
+
# e.g. (UTC-2) or (CEST/UTC-2) etc.
|
97
|
+
# todo check - only allow upcase
|
98
|
+
# or (utc-2) and (cest/utc-2) too - why? why not?
|
99
|
+
|
100
|
+
TIMEZONE_RE = %r{
|
101
|
+
(?<timezone>
|
102
|
+
\(
|
103
|
+
## optional "local" timezone name eg. BRT or CEST etc.
|
104
|
+
(?: [a-z]+
|
105
|
+
/
|
106
|
+
)?
|
107
|
+
[a-z]+
|
108
|
+
[+-]
|
109
|
+
\d{1,4} ## e.g. 0 or 00 or 0000
|
110
|
+
\)
|
111
|
+
)
|
112
|
+
}ix
|
113
|
+
|
114
|
+
|
115
|
+
|
116
|
+
GEO_BASICS_RE = %r{
|
117
|
+
(?<spaces> [ ]{2,}) |
|
118
|
+
(?<space> [ ])
|
119
|
+
|
|
120
|
+
(?<sym> [,›>\[] )
|
121
|
+
}ix
|
122
|
+
|
123
|
+
|
124
|
+
|
125
|
+
|
126
|
+
GEO_RE = Regexp.union(
|
127
|
+
TIMEZONE_RE,
|
128
|
+
GEO_BASICS_RE,
|
129
|
+
GEO_TEXT_RE,
|
130
|
+
ANY_RE,
|
131
|
+
)
|
132
|
+
|
133
|
+
end # class Lexer
|
134
|
+
end # module SportDb
|
@@ -2,6 +2,17 @@ module SportDb
|
|
2
2
|
class Lexer
|
3
3
|
|
4
4
|
|
5
|
+
|
6
|
+
## todo - move ANY_RE to token_commons or such - as it is shared by many?
|
7
|
+
|
8
|
+
## general catch-all (RECOMMENDED (ALWAYS) use as last entry in union)
|
9
|
+
## to avoid advance of pos match!!!
|
10
|
+
ANY_RE = %r{
|
11
|
+
(?<any> .)
|
12
|
+
}ix
|
13
|
+
|
14
|
+
|
15
|
+
|
5
16
|
## note - do NOT allow single alpha text for now
|
6
17
|
## add later?? A - B C - D - why?
|
7
18
|
## opt 1) one alpha
|
@@ -68,25 +79,18 @@ TEXT_RE = %r{
|
|
68
79
|
\d+\.-\d+\. [ ]? \p{L}+
|
69
80
|
)
|
70
81
|
|
71
|
-
(?:(?: (?:[ ]
|
82
|
+
(?:(?: (?:[ ] # only single spaces allowed inline!!!
|
72
83
|
(?! (?-i: vs?[ ])
|
73
84
|
) ## note - exclude (v[ ]/vs[ ])
|
74
85
|
## AND switch to case-sensitive (via -i!!!)
|
75
86
|
)
|
76
|
-
|
|
77
|
-
[
|
87
|
+
|
|
88
|
+
[/-] ## must NOT be surrounded by spaces
|
78
89
|
)?
|
79
90
|
(?:
|
80
91
|
\p{L}
|
81
92
|
|
|
82
|
-
[
|
83
|
-
|
|
84
|
-
(?: (?<! [ ]) ## todo - check regex - make sure lookbehind is always first/before!!
|
85
|
-
[-] ### allow e.g. Sport- if lookbehind is unicode letter or dot (.)
|
86
|
-
### or U.N.A.M.-Pumas
|
87
|
-
## (?<= [\p{L}.] )
|
88
|
-
## try more flexible (use negative lookbehind - no space)
|
89
|
-
)
|
93
|
+
[.&'°]
|
90
94
|
|
|
91
95
|
(?:
|
92
96
|
\d+
|
@@ -99,8 +103,7 @@ TEXT_RE = %r{
|
|
99
103
|
## negative lookahead for numbers
|
100
104
|
## note - include digits itself!!!
|
101
105
|
## note - remove / (slash) e.g. allows UDI'19/Beter Bed
|
102
|
-
)
|
103
|
-
\.
|
106
|
+
)
|
104
107
|
)
|
105
108
|
)* ## must NOT end with space or dash(-)
|
106
109
|
## todo/fix - possible in regex here
|
data/lib/sportdb/parser/token.rb
CHANGED
@@ -21,49 +21,6 @@ TIME_RE = %r{
|
|
21
21
|
|
22
22
|
|
23
23
|
|
24
|
-
##
|
25
|
-
# for timezone format use for now:
|
26
|
-
# (BRT/UTC-3) (e.g. brazil time)
|
27
|
-
#
|
28
|
-
# (CET/UTC+1) - central european time
|
29
|
-
# (CEST/UTC+2) - central european summer time - daylight saving time (DST).
|
30
|
-
# (EET/UTC+1) - eastern european time
|
31
|
-
# (EEST/UTC+2) - eastern european summer time - daylight saving time (DST).
|
32
|
-
#
|
33
|
-
# UTC+3
|
34
|
-
# UTC+4
|
35
|
-
# UTC+0
|
36
|
-
# UTC+00
|
37
|
-
# UTC+0000
|
38
|
-
#
|
39
|
-
# - allow +01 or +0100 - why? why not
|
40
|
-
# - +0130 (01:30)
|
41
|
-
#
|
42
|
-
# see
|
43
|
-
# https://en.wikipedia.org/wiki/Time_zone
|
44
|
-
# https://en.wikipedia.org/wiki/List_of_UTC_offsets
|
45
|
-
# https://en.wikipedia.org/wiki/UTC−04:00 etc.
|
46
|
-
#
|
47
|
-
# e.g. (UTC-2) or (CEST/UTC-2) etc.
|
48
|
-
# todo check - only allow upcase
|
49
|
-
# or (utc-2) and (cest/utc-2) too - why? why not?
|
50
|
-
|
51
|
-
TIMEZONE_RE = %r{
|
52
|
-
(?<timezone>
|
53
|
-
\(
|
54
|
-
## optional "local" timezone name eg. BRT or CEST etc.
|
55
|
-
(?: [a-z]+
|
56
|
-
/
|
57
|
-
)?
|
58
|
-
[a-z]+
|
59
|
-
[+-]
|
60
|
-
\d{1,4} ## e.g. 0 or 00 or 0000
|
61
|
-
\)
|
62
|
-
)
|
63
|
-
}ix
|
64
|
-
|
65
|
-
|
66
|
-
|
67
24
|
## add wday / stand-alone week day - as separate regex or
|
68
25
|
## use TEXT with is_wday? check or such with
|
69
26
|
## requirement of beginning of line (anchored to line) only??
|
@@ -124,12 +81,6 @@ BASICS_RE = %r{
|
|
124
81
|
}ix
|
125
82
|
|
126
83
|
|
127
|
-
## general catch-all (RECOMMENDED (ALWAYS) use as last entry in union)
|
128
|
-
## to avoid advance of pos match!!!
|
129
|
-
ANY_RE = %r{
|
130
|
-
(?<any> .)
|
131
|
-
}ix
|
132
|
-
|
133
84
|
|
134
85
|
RE = Regexp.union(
|
135
86
|
STATUS_RE,
|
@@ -148,21 +99,6 @@ RE = Regexp.union(
|
|
148
99
|
)
|
149
100
|
|
150
101
|
|
151
|
-
GEO_BASICS_RE = %r{
|
152
|
-
(?<spaces> [ ]{2,}) |
|
153
|
-
(?<space> [ ])
|
154
|
-
|
|
155
|
-
(?<sym> [,›>\[] )
|
156
|
-
}ix
|
157
|
-
|
158
|
-
|
159
|
-
GEO_RE = Regexp.union(
|
160
|
-
TIMEZONE_RE,
|
161
|
-
GEO_BASICS_RE,
|
162
|
-
TEXT_RE,
|
163
|
-
ANY_RE,
|
164
|
-
)
|
165
|
-
|
166
102
|
|
167
103
|
######################################################
|
168
104
|
## goal mode (switched to by PLAYER_WITH_MINUTE_RE)
|
data/lib/sportdb/parser.rb
CHANGED
@@ -23,6 +23,7 @@ require_relative 'parser/token-text'
|
|
23
23
|
require_relative 'parser/token-status'
|
24
24
|
require_relative 'parser/token-minute'
|
25
25
|
require_relative 'parser/token-prop' ## team prop(erty) mode (note - must be before token)
|
26
|
+
require_relative 'parser/token-geo'
|
26
27
|
require_relative 'parser/token'
|
27
28
|
require_relative 'parser/lexer'
|
28
29
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sportdb-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.17
|
4
|
+
version: 0.6.18
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-03-
|
11
|
+
date: 2025-03-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: cocos
|
@@ -104,6 +104,7 @@ files:
|
|
104
104
|
- lib/sportdb/parser/racc_parser.rb
|
105
105
|
- lib/sportdb/parser/racc_tree.rb
|
106
106
|
- lib/sportdb/parser/token-date.rb
|
107
|
+
- lib/sportdb/parser/token-geo.rb
|
107
108
|
- lib/sportdb/parser/token-minute.rb
|
108
109
|
- lib/sportdb/parser/token-prop.rb
|
109
110
|
- lib/sportdb/parser/token-score.rb
|