sportdb-parser 0.6.16 → 0.6.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3da9280d27bf1e4662eb10f9451679e4aace18b9a0e1bfa29dd1e7b6bcbdc5e5
4
- data.tar.gz: e6786f648848cd075ef3e0f6d8d7fda2d31743f989653c0fcf2312a33a223357
3
+ metadata.gz: 45e7965659ecd817aa2f54d04f81c673c6b7be3c1ea3294bf978b25462786726
4
+ data.tar.gz: c897aac96c5229c589af5966e68497d1dcd28b748241a15d0f386a2b65265b6a
5
5
  SHA512:
6
- metadata.gz: 04250d17d120c12dc0b3980ff971b02fa178e617f35af70651f86011d9f5d4cad1d81df84a1f5af97ab73cb9023cc6cb190b13c420af71f3bcb2af7df6a526f1
7
- data.tar.gz: 120486063a9a82891a63914654965b799aef774680695de8bda3bb52894399d0800b98efb852f87f672dbe303dc6c415b91a10a094989e94ddf3e319b3183cc9
6
+ metadata.gz: f918e333fba1dae8abcc3f143db0ef39b9c64e4fcaf89469482173c5b2944ba9a4e58cb9e71642c6fa5f14069609a40df23a62f6a0cc697bd22b465e73a17c97
7
+ data.tar.gz: 0c39baca8b5bd3d05720c1ade1c492361b186268ab549819e2428d41d2c2bab34424deefe328ca63504cf1bc802694edbea58e438e6225d7f2c69e325b662213
data/CHANGELOG.md CHANGED
@@ -1,4 +1,4 @@
1
- ### 0.6.16
1
+ ### 0.6.18
2
2
  ### 0.0.1 / 2024-07-12
3
3
 
4
4
  * Everything is new. First release.
data/Manifest.txt CHANGED
@@ -15,6 +15,7 @@ lib/sportdb/parser/parser.rb
15
15
  lib/sportdb/parser/racc_parser.rb
16
16
  lib/sportdb/parser/racc_tree.rb
17
17
  lib/sportdb/parser/token-date.rb
18
+ lib/sportdb/parser/token-geo.rb
18
19
  lib/sportdb/parser/token-minute.rb
19
20
  lib/sportdb/parser/token-prop.rb
20
21
  lib/sportdb/parser/token-score.rb
@@ -0,0 +1,134 @@
1
+ module SportDb
2
+ class Lexer
3
+
4
+
5
+ ##
6
+ # allow Cote d'Ivoire or such
7
+ ## e.g. add '
8
+
9
+
10
+ ## todo/fix - make geo text regex more generic
11
+ ## only care about two space rule
12
+
13
+
14
+ GEO_TEXT_RE = %r{
15
+ ## must start with alpha (allow unicode letters!!)
16
+ (?<text>
17
+ ## positive lookbehind - for now space (or beginning of line - for testing) only
18
+ ## (MUST be fixed number of chars - no quantifier e.g. +? etc.)
19
+ (?<= [ ,›>\[\]]|^)
20
+ (?:
21
+ # opt 1 - start with alpha
22
+ \p{L}+ ## all unicode letters (e.g. [a-z])
23
+ |
24
+ # opt 2 - start with num!! -
25
+ \d+ # check for num lookahead (MUST be space or dot)
26
+ ## MAY be followed by (optional space) !
27
+ ## MUST be followed by a letter (a to z)!!!!
28
+ [ ]? ## make space optional too - why? why not?
29
+ ## yes - eg. 1st, 2nd, 5th etc.
30
+ \p{L}+
31
+ |
32
+ ## opt 3 - add another weirdo case
33
+ ## e.g. 's Gravenwezel-Schilde
34
+ ## add more letters (or sequences here - why? why not?)
35
+ '\p{L}+
36
+ )
37
+
38
+ ##
39
+ ## todo/check - find a different "more intuitive" regex/rule if possible?
40
+ ## for single spaces only (and _/ MUST not be surround by spaces)
41
+
42
+ (?:
43
+ [ ]? # only single spaces allowed inline!!!
44
+ (?:
45
+ \p{L} | \d | [.&'°]
46
+ |
47
+ (?: (?<! [ ]) ## no space allowed before (but possible after)
48
+ [-]
49
+ )
50
+ |
51
+ (?: (?<! [ ]) ## no spaces allowed around these characters
52
+ [_/]
53
+ (?! [ ])
54
+ )
55
+ )+
56
+ )*
57
+
58
+ ## must NOT end with space or dash(-)
59
+ ## todo/fix - possible in regex here
60
+ ## only end in alphanum a-z0-9 (not dot or & ???)
61
+
62
+ ## add lookahead/lookbehind
63
+ ## must be space!!!
64
+ ## (or comma or start/end of string)
65
+ ## kind of \b !!!
66
+ ## positive lookahead
67
+ (?=[ ,›>\[\]]|$)
68
+ )
69
+ }ix
70
+
71
+
72
+
73
+ ##
74
+ # for timezone format use for now:
75
+ # (BRT/UTC-3) (e.g. brazil time)
76
+ #
77
+ # (CET/UTC+1) - central european time
78
+ # (CEST/UTC+2) - central european summer time - daylight saving time (DST).
79
+ # (EET/UTC+2) - eastern european time
80
+ # (EEST/UTC+3) - eastern european summer time - daylight saving time (DST).
81
+ #
82
+ # UTC+3
83
+ # UTC+4
84
+ # UTC+0
85
+ # UTC+00
86
+ # UTC+0000
87
+ #
88
+ # - allow +01 or +0100 - why? why not
89
+ # - +0130 (01:30)
90
+ #
91
+ # see
92
+ # https://en.wikipedia.org/wiki/Time_zone
93
+ # https://en.wikipedia.org/wiki/List_of_UTC_offsets
94
+ # https://en.wikipedia.org/wiki/UTC−04:00 etc.
95
+ #
96
+ # e.g. (UTC-2) or (CEST/UTC-2) etc.
97
+ # todo check - only allow upcase
98
+ # or (utc-2) and (cest/utc-2) too - why? why not?
99
+
100
+ TIMEZONE_RE = %r{
101
+ (?<timezone>
102
+ \(
103
+ ## optional "local" timezone name eg. BRT or CEST etc.
104
+ (?: [a-z]+
105
+ /
106
+ )?
107
+ [a-z]+
108
+ [+-]
109
+ \d{1,4} ## e.g. 0 or 00 or 0000
110
+ \)
111
+ )
112
+ }ix
113
+
114
+
115
+
116
+ GEO_BASICS_RE = %r{
117
+ (?<spaces> [ ]{2,}) |
118
+ (?<space> [ ])
119
+ |
120
+ (?<sym> [,›>\[] )
121
+ }ix
122
+
123
+
124
+
125
+
126
+ GEO_RE = Regexp.union(
127
+ TIMEZONE_RE,
128
+ GEO_BASICS_RE,
129
+ GEO_TEXT_RE,
130
+ ANY_RE,
131
+ )
132
+
133
+ end # class Lexer
134
+ end # module SportDb
@@ -69,10 +69,12 @@ MINUTE_RE = %r{
69
69
  # or others with first matching position
70
70
  # or if chars get eaten-up?
71
71
  # let us know if \G is required here or not
72
+ #
73
+ ## note - use \A (instead of ^) - \A strictly matches the start of the string.
72
74
 
73
75
 
74
76
  PLAYER_WITH_MINUTE_RE = %r{
75
- ^ ### note - MUST start line; leading spaces optional (eat-up)
77
+ \A ### note - MUST start line; leading spaces optional (eat-up)
76
78
  [ ]*
77
79
  (?: # optional open bracket ([) -- remove later
78
80
  (?<open_bracket> \[ )
@@ -143,8 +145,11 @@ PLAYER_WITH_MINUTE_RE = %r{
143
145
  }ix
144
146
 
145
147
 
148
+
149
+ ## note - use \A (instead of ^) - \A strictly matches the start of the string.
150
+
146
151
  PLAYER_WITH_SCORE_RE = %r{
147
- ^ ### note - MUST start line; leading spaces optional (eat-up)
152
+ \A ### note - MUST start line; leading spaces optional (eat-up)
148
153
  [ ]*
149
154
  (?<player_with_score>
150
155
  (?<score>
@@ -2,6 +2,17 @@ module SportDb
2
2
  class Lexer
3
3
 
4
4
 
5
+
6
+ ## todo - move ANY_RE to token_commons or such - shared by many lexer modes?
7
+
8
+ ## general catch-all (RECOMMENDED (ALWAYS) use as last entry in union)
9
+ ## to avoid advance of pos match!!!
10
+ ANY_RE = %r{
11
+ (?<any> .)
12
+ }ix
13
+
14
+
15
+
5
16
  ## note - do NOT allow single alpha text for now
6
17
  ## add later?? A - B C - D - why?
7
18
  ## opt 1) one alpha
@@ -60,34 +71,39 @@ TEXT_RE = %r{
60
71
  1/ \d{1,2} [ ] \p{L}+
61
72
  |
62
73
  ## opt 4 - add another weirdo case
74
+ ## e.g. 's Gravenwezel-Schilde
75
+ '[s]
76
+ |
77
+ ## opt 5 - add another weirdo case
63
78
  ## e.g. 5.-8. Platz Playoffs - keep - why? why not?
64
79
  \d+\.-\d+\. [ ]? \p{L}+
65
80
  )
66
81
 
67
- (?:(?: (?:[ ]
82
+ (?:(?: (?:[ ] # only single spaces allowed inline!!!
68
83
  (?! (?-i: vs?[ ])
69
84
  ) ## note - exclude (v[ ]/vs[ ])
70
85
  ## AND switch to case-sensitive (via -i!!!)
71
86
  )
72
- | # only single spaces allowed inline!!!
73
- [-/]
87
+ |
88
+ [/-] ## must NOT be surrounded by spaces
74
89
  )?
75
90
  (?:
76
- \p{L} |
77
- [&'°]
78
- |
91
+ \p{L}
92
+ |
93
+ [.&'°]
94
+ |
79
95
  (?:
80
96
  \d+
81
97
  (?!
82
- [0-9h'+-] | ## protected break on 12h / 12' / 1-1
83
- ## check usege for 3+4 - possible? where ? why?
84
- (?:[.:]\d) ## protected/exclude/break on 12.03 / 12:03
98
+ [0-9h'+] | ## protected break on 12h / 12' / 3+4
99
+ ## check usage for 3+4 - possible? where ? why?
100
+ (?:[.:-]\d) ## protected/exclude/break on 12.03 / 12:03 / 12-12
101
+ ## BUT allow Park21-Arena for example e.g. 21-A :-)
85
102
  )
86
103
  ## negative lookahead for numbers
87
104
  ## note - include digits itself!!!
88
105
  ## note - remove / (slash) e.g. allows UDI'19/Beter Bed
89
- )|
90
- \.
106
+ )
91
107
  )
92
108
  )* ## must NOT end with space or dash(-)
93
109
  ## todo/fix - possible in regex here
@@ -21,49 +21,6 @@ TIME_RE = %r{
21
21
 
22
22
 
23
23
 
24
- ##
25
- # for timezone format use for now:
26
- # (BRT/UTC-3) (e.g. brazil time)
27
- #
28
- # (CET/UTC+1) - central european time
29
- # (CEST/UTC+2) - central european summer time - daylight saving time (DST).
30
- # (EET/UTC+1) - eastern european time
31
- # (EEST/UTC+2) - eastern european summer time - daylight saving time (DST).
32
- #
33
- # UTC+3
34
- # UTC+4
35
- # UTC+0
36
- # UTC+00
37
- # UTC+0000
38
- #
39
- # - allow +01 or +0100 - why? why not
40
- # - +0130 (01:30)
41
- #
42
- # see
43
- # https://en.wikipedia.org/wiki/Time_zone
44
- # https://en.wikipedia.org/wiki/List_of_UTC_offsets
45
- # https://en.wikipedia.org/wiki/UTC−04:00 etc.
46
- #
47
- # e.g. (UTC-2) or (CEST/UTC-2) etc.
48
- # todo check - only allow upcase
49
- # or (utc-2) and (cest/utc-2) too - why? why not?
50
-
51
- TIMEZONE_RE = %r{
52
- (?<timezone>
53
- \(
54
- ## optional "local" timezone name eg. BRT or CEST etc.
55
- (?: [a-z]+
56
- /
57
- )?
58
- [a-z]+
59
- [+-]
60
- \d{1,4} ## e.g. 0 or 00 or 0000
61
- \)
62
- )
63
- }ix
64
-
65
-
66
-
67
24
  ## add wday / stand-alone week day - as separate regex or
68
25
  ## use TEXT with is_wday? check or such with
69
26
  ## requirement of beginning of line (anchored to line) only??
@@ -124,12 +81,6 @@ BASICS_RE = %r{
124
81
  }ix
125
82
 
126
83
 
127
- ## general catch-all (RECOMMENDED (ALWAYS) use as last entry in union)
128
- ## to avoid advance of pos match!!!
129
- ANY_RE = %r{
130
- (?<any> .)
131
- }ix
132
-
133
84
 
134
85
  RE = Regexp.union(
135
86
  STATUS_RE,
@@ -148,21 +99,6 @@ RE = Regexp.union(
148
99
  )
149
100
 
150
101
 
151
- GEO_BASICS_RE = %r{
152
- (?<spaces> [ ]{2,}) |
153
- (?<space> [ ])
154
- |
155
- (?<sym> [,›>\[] )
156
- }ix
157
-
158
-
159
- GEO_RE = Regexp.union(
160
- TIMEZONE_RE,
161
- GEO_BASICS_RE,
162
- TEXT_RE,
163
- ANY_RE,
164
- )
165
-
166
102
 
167
103
  ######################################################
168
104
  ## goal mode (switched to by PLAYER_WITH_MINUTE_RE)
@@ -199,7 +135,8 @@ PROP_GOAL_RE = Regexp.union(
199
135
 
200
136
  ####
201
137
  #
202
- ROUND_OUTLINE_RE = %r{ ^
138
+ ## note - use \A (instead of ^) - \A strictly matches the start of the string.
139
+ ROUND_OUTLINE_RE = %r{ \A
203
140
  [ ]* ## ignore leading spaces (if any)
204
141
  (?: »|>> )
205
142
  [ ]+
@@ -4,7 +4,7 @@ module SportDb
4
4
  module Parser
5
5
  MAJOR = 0 ## todo: namespace inside version or something - why? why not??
6
6
  MINOR = 6
7
- PATCH = 16
7
+ PATCH = 18
8
8
  VERSION = [MAJOR,MINOR,PATCH].join('.')
9
9
 
10
10
  def self.version
@@ -23,6 +23,7 @@ require_relative 'parser/token-text'
23
23
  require_relative 'parser/token-status'
24
24
  require_relative 'parser/token-minute'
25
25
  require_relative 'parser/token-prop' ## team prop(erty) mode (note - must be before token)
26
+ require_relative 'parser/token-geo'
26
27
  require_relative 'parser/token'
27
28
  require_relative 'parser/lexer'
28
29
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sportdb-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.16
4
+ version: 0.6.18
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-03-09 00:00:00.000000000 Z
11
+ date: 2025-03-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: cocos
@@ -104,6 +104,7 @@ files:
104
104
  - lib/sportdb/parser/racc_parser.rb
105
105
  - lib/sportdb/parser/racc_tree.rb
106
106
  - lib/sportdb/parser/token-date.rb
107
+ - lib/sportdb/parser/token-geo.rb
107
108
  - lib/sportdb/parser/token-minute.rb
108
109
  - lib/sportdb/parser/token-prop.rb
109
110
  - lib/sportdb/parser/token-score.rb