sportdb-parser 0.6.17 → 0.6.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1ba31cc4284de6a4ea05615ad37a30546375c5e3218847a8f69c3c359074fb9c
4
- data.tar.gz: 5b8c73196c6bd08a399c7687cd65f4e05f4c3a66580eaeecd6dbaad33837b822
3
+ metadata.gz: 45e7965659ecd817aa2f54d04f81c673c6b7be3c1ea3294bf978b25462786726
4
+ data.tar.gz: c897aac96c5229c589af5966e68497d1dcd28b748241a15d0f386a2b65265b6a
5
5
  SHA512:
6
- metadata.gz: cf4bc5b5a112effc59895c405e3484224d4d94aadf4771fecff90458b01aa63b43703632fdada70df9ee906aa8e517b1736dbdb2de795ba53b3b8e05d6c1ba4c
7
- data.tar.gz: 46b6a97cb0af77debec3bd58ee24bd12b1236680dc53ff76a50b725621961bb9aabdfc151d9379d01672ebbef2423c3ce4f4d948adc506789c846491a7f30f15
6
+ metadata.gz: f918e333fba1dae8abcc3f143db0ef39b9c64e4fcaf89469482173c5b2944ba9a4e58cb9e71642c6fa5f14069609a40df23a62f6a0cc697bd22b465e73a17c97
7
+ data.tar.gz: 0c39baca8b5bd3d05720c1ade1c492361b186268ab549819e2428d41d2c2bab34424deefe328ca63504cf1bc802694edbea58e438e6225d7f2c69e325b662213
data/CHANGELOG.md CHANGED
@@ -1,4 +1,4 @@
1
- ### 0.6.17
1
+ ### 0.6.18
2
2
  ### 0.0.1 / 2024-07-12
3
3
 
4
4
  * Everything is new. First release.
data/Manifest.txt CHANGED
@@ -15,6 +15,7 @@ lib/sportdb/parser/parser.rb
15
15
  lib/sportdb/parser/racc_parser.rb
16
16
  lib/sportdb/parser/racc_tree.rb
17
17
  lib/sportdb/parser/token-date.rb
18
+ lib/sportdb/parser/token-geo.rb
18
19
  lib/sportdb/parser/token-minute.rb
19
20
  lib/sportdb/parser/token-prop.rb
20
21
  lib/sportdb/parser/token-score.rb
@@ -0,0 +1,134 @@
1
+ module SportDb
2
+ class Lexer
3
+
4
+
5
+ ##
6
+ # allow Côte d'Ivoire or such
7
+ ## e.g. add '
8
+
9
+
10
+ ## todo/fix - make geo text regex more generic
11
+ ## only care about two space rule
12
+
13
+
14
+ GEO_TEXT_RE = %r{
15
+ ## must start with alpha (allow unicode letters!!)
16
+ (?<text>
17
+ ## positive lookbehind - space, comma, ›, >, [ or ] (or beginning of line - for testing)
18
+ ## (MUST be fixed number of chars - no quantifier e.g. +? etc.)
19
+ (?<= [ ,›>\[\]]|^)
20
+ (?:
21
+ # opt 1 - start with alpha
22
+ \p{L}+ ## all unicode letters (e.g. [a-z])
23
+ |
24
+ # opt 2 - start with num!! -
25
+ \d+ # check for num lookahead (MUST be space or dot)
26
+ ## MAY be followed by (optional space) !
27
+ ## MUST be followed by a to z!!!!
28
+ [ ]? ## make space optional too - why? why not?
29
+ ## yes - eg. 1st, 2nd, 5th etc.
30
+ \p{L}+
31
+ |
32
+ ## opt 3 - add another weirdo case
33
+ ## e.g. 's Gravenwezel-Schilde
34
+ ## add more letters (or sequences here - why? why not?)
35
+ '\p{L}+
36
+ )
37
+
38
+ ##
39
+ ## todo/check - find a different "more intuitive" regex/rule if possible?
40
+ ## for single spaces only (and _/ MUST NOT be surrounded by spaces)
41
+
42
+ (?:
43
+ [ ]? # only single spaces allowed inline!!!
44
+ (?:
45
+ \p{L} | \d | [.&'°]
46
+ |
47
+ (?: (?<! [ ]) ## no space allowed before (but possible after)
48
+ [-]
49
+ )
50
+ |
51
+ (?: (?<! [ ]) ## no spaces allowed around these characters
52
+ [_/]
53
+ (?! [ ])
54
+ )
55
+ )+
56
+ )*
57
+
58
+ ## must NOT end with space or dash(-)
59
+ ## todo/fix - possible in regex here
60
+ ## only end in alphanum a-z0-9 (not dot or & ???)
61
+
62
+ ## add lookahead/lookbehind
63
+ ## must be space!!!
64
+ ## (or comma or start/end of string)
65
+ ## kind of \b !!!
66
+ ## positive lookahead
67
+ (?=[ ,›>\[\]]|$)
68
+ )
69
+ }ix
70
+
71
+
72
+
73
+ ##
74
+ # for timezone format use for now:
75
+ # (BRT/UTC-3) (e.g. brazil time)
76
+ #
77
+ # (CET/UTC+1) - central european time
78
+ # (CEST/UTC+2) - central european summer time - daylight saving time (DST).
79
+ # (EET/UTC+2) - eastern european time
80
+ # (EEST/UTC+3) - eastern european summer time - daylight saving time (DST).
81
+ #
82
+ # UTC+3
83
+ # UTC+4
84
+ # UTC+0
85
+ # UTC+00
86
+ # UTC+0000
87
+ #
88
+ # - allow +01 or +0100 - why? why not
89
+ # - +0130 (01:30)
90
+ #
91
+ # see
92
+ # https://en.wikipedia.org/wiki/Time_zone
93
+ # https://en.wikipedia.org/wiki/List_of_UTC_offsets
94
+ # https://en.wikipedia.org/wiki/UTC−04:00 etc.
95
+ #
96
+ # e.g. (UTC-2) or (CEST/UTC-2) etc.
97
+ # todo check - only allow upcase
98
+ # or (utc-2) and (cest/utc-2) too - why? why not?
99
+
100
+ TIMEZONE_RE = %r{
101
+ (?<timezone>
102
+ \(
103
+ ## optional "local" timezone name eg. BRT or CEST etc.
104
+ (?: [a-z]+
105
+ /
106
+ )?
107
+ [a-z]+
108
+ [+-]
109
+ \d{1,4} ## e.g. 0 or 00 or 0000
110
+ \)
111
+ )
112
+ }ix
113
+
114
+
115
+
116
+ GEO_BASICS_RE = %r{
117
+ (?<spaces> [ ]{2,}) |
118
+ (?<space> [ ])
119
+ |
120
+ (?<sym> [,›>\[] )
121
+ }ix
122
+
123
+
124
+
125
+
126
+ GEO_RE = Regexp.union(
127
+ TIMEZONE_RE,
128
+ GEO_BASICS_RE,
129
+ GEO_TEXT_RE,
130
+ ANY_RE,
131
+ )
132
+
133
+ end # class Lexer
134
+ end # module SportDb
@@ -2,6 +2,17 @@ module SportDb
2
2
  class Lexer
3
3
 
4
4
 
5
+
6
+ ## todo - move ANY_RE to token_commons or such - since it is shared by many?
7
+
8
+ ## general catch-all (RECOMMENDED (ALWAYS) use as last entry in union)
9
+ ## to avoid advance of pos match!!!
10
+ ANY_RE = %r{
11
+ (?<any> .)
12
+ }ix
13
+
14
+
15
+
5
16
  ## note - do NOT allow single alpha text for now
6
17
  ## add later?? A - B C - D - why?
7
18
  ## opt 1) one alpha
@@ -68,25 +79,18 @@ TEXT_RE = %r{
68
79
  \d+\.-\d+\. [ ]? \p{L}+
69
80
  )
70
81
 
71
- (?:(?: (?:[ ]
82
+ (?:(?: (?:[ ] # only single spaces allowed inline!!!
72
83
  (?! (?-i: vs?[ ])
73
84
  ) ## note - exclude (v[ ]/vs[ ])
74
85
  ## AND switch to case-sensitive (via -i!!!)
75
86
  )
76
- | # only single spaces allowed inline!!!
77
- [_/]
87
+ |
88
+ [/-] ## must NOT be surrounded by spaces
78
89
  )?
79
90
  (?:
80
91
  \p{L}
81
92
  |
82
- [&'°]
83
- |
84
- (?: (?<! [ ]) ## todo - check regex - make sure lookbehind is always first/before!!
85
- [-] ### allow e.g. Sport- if lookbehind is unicode letter or dot (.)
86
- ### or U.N.A.M.-Pumas
87
- ## (?<= [\p{L}.] )
88
- ## try more flexible (use negative lookbehind - no space)
89
- )
93
+ [.&'°]
90
94
  |
91
95
  (?:
92
96
  \d+
@@ -99,8 +103,7 @@ TEXT_RE = %r{
99
103
  ## negative lookahead for numbers
100
104
  ## note - include digits itself!!!
101
105
  ## note - remove / (slash) e.g. allows UDI'19/Beter Bed
102
- )|
103
- \.
106
+ )
104
107
  )
105
108
  )* ## must NOT end with space or dash(-)
106
109
  ## todo/fix - possible in regex here
@@ -21,49 +21,6 @@ TIME_RE = %r{
21
21
 
22
22
 
23
23
 
24
- ##
25
- # for timezone format use for now:
26
- # (BRT/UTC-3) (e.g. brazil time)
27
- #
28
- # (CET/UTC+1) - central european time
29
- # (CEST/UTC+2) - central european summer time - daylight saving time (DST).
30
- # (EET/UTC+1) - eastern european time
31
- # (EEST/UTC+2) - eastern european summer time - daylight saving time (DST).
32
- #
33
- # UTC+3
34
- # UTC+4
35
- # UTC+0
36
- # UTC+00
37
- # UTC+0000
38
- #
39
- # - allow +01 or +0100 - why? why not
40
- # - +0130 (01:30)
41
- #
42
- # see
43
- # https://en.wikipedia.org/wiki/Time_zone
44
- # https://en.wikipedia.org/wiki/List_of_UTC_offsets
45
- # https://en.wikipedia.org/wiki/UTC−04:00 etc.
46
- #
47
- # e.g. (UTC-2) or (CEST/UTC-2) etc.
48
- # todo check - only allow upcase
49
- # or (utc-2) and (cest/utc-2) too - why? why not?
50
-
51
- TIMEZONE_RE = %r{
52
- (?<timezone>
53
- \(
54
- ## optional "local" timezone name eg. BRT or CEST etc.
55
- (?: [a-z]+
56
- /
57
- )?
58
- [a-z]+
59
- [+-]
60
- \d{1,4} ## e.g. 0 or 00 or 0000
61
- \)
62
- )
63
- }ix
64
-
65
-
66
-
67
24
  ## add wday / stand-alone week day - as separate regex or
68
25
  ## use TEXT with is_wday? check or such with
69
26
  ## requirement of beginning of line (anchored to line) only??
@@ -124,12 +81,6 @@ BASICS_RE = %r{
124
81
  }ix
125
82
 
126
83
 
127
- ## general catch-all (RECOMMENDED (ALWAYS) use as last entry in union)
128
- ## to avoid advance of pos match!!!
129
- ANY_RE = %r{
130
- (?<any> .)
131
- }ix
132
-
133
84
 
134
85
  RE = Regexp.union(
135
86
  STATUS_RE,
@@ -148,21 +99,6 @@ RE = Regexp.union(
148
99
  )
149
100
 
150
101
 
151
- GEO_BASICS_RE = %r{
152
- (?<spaces> [ ]{2,}) |
153
- (?<space> [ ])
154
- |
155
- (?<sym> [,›>\[] )
156
- }ix
157
-
158
-
159
- GEO_RE = Regexp.union(
160
- TIMEZONE_RE,
161
- GEO_BASICS_RE,
162
- TEXT_RE,
163
- ANY_RE,
164
- )
165
-
166
102
 
167
103
  ######################################################
168
104
  ## goal mode (switched to by PLAYER_WITH_MINUTE_RE)
@@ -4,7 +4,7 @@ module SportDb
4
4
  module Parser
5
5
  MAJOR = 0 ## todo: namespace inside version or something - why? why not??
6
6
  MINOR = 6
7
- PATCH = 17
7
+ PATCH = 18
8
8
  VERSION = [MAJOR,MINOR,PATCH].join('.')
9
9
 
10
10
  def self.version
@@ -23,6 +23,7 @@ require_relative 'parser/token-text'
23
23
  require_relative 'parser/token-status'
24
24
  require_relative 'parser/token-minute'
25
25
  require_relative 'parser/token-prop' ## team prop(erty) mode (note - must be before token)
26
+ require_relative 'parser/token-geo'
26
27
  require_relative 'parser/token'
27
28
  require_relative 'parser/lexer'
28
29
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sportdb-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.17
4
+ version: 0.6.18
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-03-10 00:00:00.000000000 Z
11
+ date: 2025-03-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: cocos
@@ -104,6 +104,7 @@ files:
104
104
  - lib/sportdb/parser/racc_parser.rb
105
105
  - lib/sportdb/parser/racc_tree.rb
106
106
  - lib/sportdb/parser/token-date.rb
107
+ - lib/sportdb/parser/token-geo.rb
107
108
  - lib/sportdb/parser/token-minute.rb
108
109
  - lib/sportdb/parser/token-prop.rb
109
110
  - lib/sportdb/parser/token-score.rb