sportdb-parser 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Manifest.txt +1 -2
- data/bin/{fbt → fbtok} +10 -37
- data/lib/sportdb/parser/linter.rb +18 -18
- data/lib/sportdb/parser/token-date.rb +29 -0
- data/lib/sportdb/parser/token-text.rb +50 -40
- data/lib/sportdb/parser/version.rb +2 -2
- data/lib/sportdb/parser.rb +3 -3
- metadata +4 -5
- data/lib/sportdb/parser/opts.rb +0 -70
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9ebb468318f2b87c33ca66afb6c46611ce5f420258e0c41b40a2cbfabcff7a49
|
4
|
+
data.tar.gz: 0cf1d511f3e936d73531442d1ca6bef94d90a50ae65346b5b57347d4d294dc77
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e992ab97d7ae18c514de14078d30eb36adc40f5044242ce9ace089fb88b104c61b29ff86a2aa8101bb7257c3ff2ce32c6150439ff855e195bee1b26032bb0d9d
|
7
|
+
data.tar.gz: 25e66e45e7daf2783bc6507a3cb2c660d9153eab9530210ef51ef6e0d5d3fc531e5891897be3b0492b0ad7ea5fe3d406a0a3dd0559549b85518360d442ed4d8b
|
data/CHANGELOG.md
CHANGED
data/Manifest.txt
CHANGED
@@ -2,11 +2,10 @@ CHANGELOG.md
|
|
2
2
|
Manifest.txt
|
3
3
|
README.md
|
4
4
|
Rakefile
|
5
|
-
bin/fbt
|
5
|
+
bin/fbtok
|
6
6
|
lib/sportdb/parser.rb
|
7
7
|
lib/sportdb/parser/lang.rb
|
8
8
|
lib/sportdb/parser/linter.rb
|
9
|
-
lib/sportdb/parser/opts.rb
|
10
9
|
lib/sportdb/parser/outline_reader.rb
|
11
10
|
lib/sportdb/parser/parser.rb
|
12
11
|
lib/sportdb/parser/token-date.rb
|
data/bin/{fbt → fbtok}
RENAMED
@@ -1,36 +1,25 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
## tip: to test run:
|
4
|
-
## ruby -I ./lib bin/fbt
|
4
|
+
## ruby -I ./lib bin/fbtok
|
5
5
|
|
6
|
-
## our own code
|
7
6
|
require 'sportdb/parser'
|
8
7
|
|
9
8
|
|
9
|
+
require 'optparse' ## check - already auto-required in cocos? keep? why? why not?
|
10
10
|
|
11
|
-
require 'optparse'
|
12
11
|
|
13
|
-
|
14
|
-
## read textfile
|
15
|
-
## and dump tokens
|
16
|
-
##
|
17
|
-
## fbt ../openfootball/.../euro.txt
|
12
|
+
args=ARGV
|
18
13
|
|
19
14
|
|
15
|
+
opts = {
|
16
|
+
debug: true,
|
17
|
+
metal: false,
|
18
|
+
}
|
20
19
|
|
21
|
-
|
22
|
-
args = ARGV
|
23
|
-
opts = { debug: false,
|
24
|
-
metal: false }
|
25
|
-
|
26
|
-
parser = OptionParser.new do |parser|
|
20
|
+
parser = OptionParser.new do |parser|
|
27
21
|
parser.banner = "Usage: #{$PROGRAM_NAME} [options]"
|
28
22
|
|
29
|
-
##
|
30
|
-
## check if git has a offline option?? (use same)
|
31
|
-
## check for other tools - why? why not?
|
32
|
-
|
33
|
-
|
34
23
|
parser.on( "--verbose", "--debug",
|
35
24
|
"turn on verbose / debug output (default: #{opts[:debug]})" ) do |debug|
|
36
25
|
opts[:debug] = debug
|
@@ -50,29 +39,12 @@ puts "ARGV:"
|
|
50
39
|
p args
|
51
40
|
|
52
41
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
paths = if args.empty?
|
57
|
-
[
|
58
|
-
'../../../openfootball/euro/2021--europe/euro.txt',
|
59
|
-
'../../../openfootball/euro/2024--germany/euro.txt',
|
60
|
-
]
|
61
|
-
else
|
62
|
-
## check for directories
|
63
|
-
## and auto-expand
|
64
|
-
|
65
|
-
SportDb::Parser::Opts.expand_args( args )
|
66
|
-
end
|
67
|
-
|
68
|
-
|
69
|
-
|
70
42
|
SportDb::Parser::Linter.debug = true if opts[:debug]
|
71
43
|
|
72
44
|
linter = SportDb::Parser::Linter.new
|
73
|
-
|
74
45
|
errors = []
|
75
46
|
|
47
|
+
paths = args
|
76
48
|
paths.each_with_index do |path,i|
|
77
49
|
puts "==> [#{i+1}/#{paths.size}] reading >#{path}<..."
|
78
50
|
linter.read( path, parse: !opts[:metal] )
|
@@ -90,5 +62,6 @@ else
|
|
90
62
|
puts "OK no parse errors found in #{paths.size} datafile(s)"
|
91
63
|
end
|
92
64
|
|
65
|
+
|
93
66
|
puts "bye"
|
94
67
|
|
@@ -5,11 +5,11 @@ class Parser
|
|
5
5
|
###
|
6
6
|
## note - Linter for now nested inside Parser - keep? why? why not?
|
7
7
|
class Linter
|
8
|
-
|
8
|
+
|
9
9
|
def self.debug=(value) @@debug = value; end
|
10
10
|
def self.debug?() @@debug ||= false; end ## note: default is FALSE
|
11
|
-
def debug?() self.class.debug?; end
|
12
|
-
|
11
|
+
def debug?() self.class.debug?; end
|
12
|
+
|
13
13
|
|
14
14
|
|
15
15
|
attr_reader :errors
|
@@ -35,7 +35,7 @@ def errors?() @errors.size > 0; end
|
|
35
35
|
## Group B: - remove colon
|
36
36
|
## or lookup first
|
37
37
|
|
38
|
-
ATTRIB_RE = %r{^
|
38
|
+
ATTRIB_RE = %r{^
|
39
39
|
[ ]*? # slurp leading spaces
|
40
40
|
(?<key>[^:|\]\[()\/; -]
|
41
41
|
[^:|\]\[()\/;]{0,30}
|
@@ -50,12 +50,12 @@ def errors?() @errors.size > 0; end
|
|
50
50
|
|
51
51
|
#########
|
52
52
|
## parse - false (default) - tokenize (only)
|
53
|
-
## - true - tokenize & parse
|
53
|
+
## - true - tokenize & parse
|
54
54
|
def read( path, parse: false )
|
55
55
|
## note: every (new) read call - resets errors list to empty
|
56
56
|
@errors = []
|
57
57
|
|
58
|
-
nodes = OutlineReader.read( path )
|
58
|
+
nodes = OutlineReader.read( path )
|
59
59
|
|
60
60
|
## process nodes
|
61
61
|
h1 = nil
|
@@ -66,7 +66,7 @@ def read( path, parse: false )
|
|
66
66
|
|
67
67
|
nodes.each do |node|
|
68
68
|
type = node[0]
|
69
|
-
|
69
|
+
|
70
70
|
if type == :h1
|
71
71
|
h1 = node[1] ## get heading text
|
72
72
|
puts
|
@@ -74,14 +74,14 @@ def read( path, parse: false )
|
|
74
74
|
elsif type == :p
|
75
75
|
|
76
76
|
if h1.nil?
|
77
|
-
orphans += 1 ## only warn once
|
77
|
+
orphans += 1 ## only warn once
|
78
78
|
puts "!! WARN - no heading for #{orphans} text paragraph(s); skipping parse"
|
79
79
|
next
|
80
80
|
end
|
81
81
|
|
82
82
|
lines = node[1]
|
83
83
|
|
84
|
-
tree = []
|
84
|
+
tree = []
|
85
85
|
lines.each_with_index do |line,i|
|
86
86
|
|
87
87
|
if debug?
|
@@ -91,10 +91,10 @@ def read( path, parse: false )
|
|
91
91
|
|
92
92
|
|
93
93
|
## skip new (experimental attrib syntax)
|
94
|
-
if attrib_found == false &&
|
94
|
+
if attrib_found == false &&
|
95
95
|
ATTRIB_RE.match?( line )
|
96
96
|
## note: check attrib regex AFTER group def e.g.:
|
97
|
-
## Group A:
|
97
|
+
## Group A:
|
98
98
|
## Group B: etc.
|
99
99
|
## todo/fix - change Group A: to Group A etc.
|
100
100
|
## Group B: to Group B
|
@@ -107,17 +107,17 @@ def read( path, parse: false )
|
|
107
107
|
## check if line ends with dot
|
108
108
|
## if not slurp up lines to the next do!!!
|
109
109
|
## logger.debug "skipping key/value line - >#{line}<"
|
110
|
-
attrib_found = false if line.end_with?( '.' )
|
110
|
+
attrib_found = false if line.end_with?( '.' )
|
111
111
|
# logger.debug "skipping key/value line (cont.) - >#{line}<"
|
112
112
|
next
|
113
|
-
end
|
114
|
-
|
113
|
+
end
|
114
|
+
|
115
115
|
t, error_messages = if parse
|
116
116
|
@parser.parse_with_errors( line )
|
117
117
|
else
|
118
|
-
@parser.tokenize_with_errors( line )
|
118
|
+
@parser.tokenize_with_errors( line )
|
119
119
|
end
|
120
|
-
|
120
|
+
|
121
121
|
|
122
122
|
if error_messages.size > 0
|
123
123
|
## add to "global" error list
|
@@ -134,7 +134,7 @@ def read( path, parse: false )
|
|
134
134
|
|
135
135
|
tree << t
|
136
136
|
end
|
137
|
-
|
137
|
+
|
138
138
|
## pp tree
|
139
139
|
else
|
140
140
|
pp node
|
@@ -146,4 +146,4 @@ end # class Linter
|
|
146
146
|
|
147
147
|
|
148
148
|
end # class Parser
|
149
|
-
end # module SportDb
|
149
|
+
end # module SportDb
|
@@ -155,6 +155,35 @@ DATE_RE = Regexp.union(
|
|
155
155
|
)
|
156
156
|
|
157
157
|
|
158
|
+
##
|
159
|
+
## add a date parser helper
|
160
|
+
def self.parse_date( str, start: )
|
161
|
+
if m=DATE_RE.match( str )
|
162
|
+
|
163
|
+
year = m[:year].to_i(10) if m[:year]
|
164
|
+
month = MONTH_MAP[ m[:month_name].downcase ] if m[:month_name]
|
165
|
+
day = m[:day].to_i(10) if m[:day]
|
166
|
+
wday = DAY_MAP[ m[:day_name].downcase ] if m[:day_name]
|
167
|
+
|
168
|
+
if year.nil? ## try to calculate year
|
169
|
+
year = if month > start.month ||
|
170
|
+
(month == start.month && day >= start.day)
|
171
|
+
# assume same year as start_at event (e.g. 2013 for 2013/14 season)
|
172
|
+
start.year
|
173
|
+
else
|
174
|
+
# assume year+1 as start_at event (e.g. 2014 for 2013/14 season)
|
175
|
+
start.year+1
|
176
|
+
end
|
177
|
+
end
|
178
|
+
Date.new( year,month,day )
|
179
|
+
else
|
180
|
+
puts "!! ERROR - unexpected date format; cannot parse >#{str}<"
|
181
|
+
exit 1
|
182
|
+
end
|
183
|
+
end
|
184
|
+
|
185
|
+
|
186
|
+
|
158
187
|
###
|
159
188
|
# date duration
|
160
189
|
# use - or + as separator
|
@@ -1,12 +1,12 @@
|
|
1
|
-
module SportDb
|
1
|
+
module SportDb
|
2
2
|
class Parser
|
3
|
-
|
4
|
-
|
3
|
+
|
4
|
+
|
5
5
|
## note - do NOT allow single alpha text for now
|
6
|
-
## add later?? A - B C - D - why?
|
6
|
+
## add later?? A - B C - D - why?
|
7
7
|
## opt 1) one alpha
|
8
|
-
## (?<text_i> [a-z]) # only allow single letter text (not numbers!!)
|
9
|
-
|
8
|
+
## (?<text_i> [a-z]) # only allow single letter text (not numbers!!)
|
9
|
+
|
10
10
|
## opt 2) more than one alphanum
|
11
11
|
|
12
12
|
|
@@ -26,19 +26,19 @@ class Parser
|
|
26
26
|
|
27
27
|
|
28
28
|
TEXT_RE = %r{
|
29
|
-
## must start with alpha (allow unicode letters!!)
|
30
|
-
(?<text>
|
31
|
-
## positive lookbehind
|
29
|
+
## must start with alpha (allow unicode letters!!)
|
30
|
+
(?<text>
|
31
|
+
## positive lookbehind
|
32
32
|
## (MUST be fixed number of chars - no quantifier e.g. +? etc.)
|
33
33
|
(?<=[ ,;@|\[\]]
|
34
34
|
|^
|
35
35
|
)
|
36
|
-
(?:
|
36
|
+
(?:
|
37
37
|
# opt 1 - start with alpha
|
38
38
|
\p{L}+ ## all unicode letters (e.g. [a-z])
|
39
39
|
|
|
40
40
|
|
41
|
-
# opt 2 - start with num!! - allow special case (e.g. 1. FC)
|
41
|
+
# opt 2 - start with num!! - allow special case (e.g. 1. FC)
|
42
42
|
\d+ # check for num lookahead (MUST be space or dot)
|
43
43
|
## MUST be followed by (optional dot) and
|
44
44
|
## required space !!!
|
@@ -46,69 +46,79 @@ TEXT_RE = %r{
|
|
46
46
|
\.? ## optional dot
|
47
47
|
[ ]? ## make space optional too - why? why not?
|
48
48
|
## yes - eg. 1st, 2nd, 5th etc.
|
49
|
-
\p{L}+
|
49
|
+
\p{L}+
|
50
50
|
)
|
51
|
-
|
51
|
+
|
52
52
|
(?:(?: (?:[ ]
|
53
53
|
(?!vs?\.?[ ]) ## note - exclude (v[ ]/vs[ ]/v.[ ]/vs.[ ])
|
54
|
-
)
|
54
|
+
)
|
55
55
|
| # only single spaces allowed inline!!!
|
56
|
-
[-]
|
56
|
+
[-]
|
57
57
|
)?
|
58
58
|
(?:
|
59
59
|
\p{L} |
|
60
|
-
[&/']
|
60
|
+
[&/']
|
61
61
|
|
|
62
62
|
(?:
|
63
|
-
\d+
|
64
|
-
(?![0-9.:h'/+-])
|
63
|
+
\d+
|
64
|
+
(?![0-9.:h'/+-])
|
65
65
|
## negative lookahead for numbers
|
66
66
|
## note - include digits itself!!!
|
67
|
-
)|
|
68
|
-
\.
|
69
|
-
)
|
67
|
+
)|
|
68
|
+
\.
|
69
|
+
)
|
70
70
|
)* ## must NOT end with space or dash(-)
|
71
71
|
## todo/fix - possible in regex here
|
72
72
|
## only end in alphanum a-z0-9 (not dot or & ???)
|
73
73
|
|
74
|
-
|
74
|
+
|
75
75
|
## allow optional at the end
|
76
76
|
## tag or year
|
77
|
-
## make it and in the future - why? why not?
|
78
|
-
##
|
77
|
+
## make it and in the future - why? why not?
|
78
|
+
##
|
79
|
+
## change - fix
|
80
|
+
## do NOT use (A) for amateur
|
81
|
+
## use A or A. with NO ()!!!
|
79
82
|
## (A) - allow with predined alpha only for now
|
80
83
|
## e.g. (A) - amateur a team or b?
|
84
|
+
### same for U21 or U9 etc
|
85
|
+
## use with NO ()!!! - why? why not?
|
81
86
|
## or U21 U9 etc. - why? why not?
|
82
87
|
## or etc.
|
83
88
|
## (1879-1893) or allow years e.g. (1879-1893)
|
84
|
-
###
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
89
|
+
###
|
90
|
+
## add allow country code three to five letters for now
|
91
|
+
## change to generic 1 to 5 - why? why not?
|
92
|
+
## e.g. (A), (I),
|
93
|
+
## (AUT)
|
94
|
+
## (TRNC) five? for UEFA code for northern cyprus
|
95
|
+
## change to 1 to 4 - why? why not?
|
96
|
+
## check - fix possible for upper case only here
|
97
|
+
## inline for this group only?
|
93
98
|
(?:
|
94
|
-
[ ]
|
99
|
+
[ ]
|
95
100
|
\(
|
96
101
|
\d{4}-\d{4}
|
97
102
|
\)
|
98
|
-
)?
|
99
|
-
|
103
|
+
)?
|
104
|
+
(?:
|
105
|
+
[ ]+ ## allow more than once space - why? why not?
|
106
|
+
\( (?:
|
107
|
+
[A-Z]{1,5}
|
108
|
+
)
|
109
|
+
\)
|
110
|
+
)?
|
100
111
|
## add lookahead/lookbehind
|
101
|
-
## must be space!!!
|
112
|
+
## must be space!!!
|
102
113
|
## (or comma or start/end of string)
|
103
114
|
## kind of \b !!!
|
104
115
|
## positive lookahead
|
105
116
|
(?=[ ,;@|\[\]]
|
106
117
|
|$
|
107
118
|
)
|
108
|
-
)
|
119
|
+
)
|
109
120
|
}ix
|
110
121
|
|
111
122
|
|
112
123
|
end # class Parser
|
113
|
-
end # module SportDb
|
114
|
-
|
124
|
+
end # module SportDb
|
data/lib/sportdb/parser.rb
CHANGED
@@ -24,11 +24,11 @@ require_relative 'parser/lang'
|
|
24
24
|
require_relative 'parser/parser'
|
25
25
|
|
26
26
|
|
27
|
-
|
27
|
+
####
|
28
|
+
## todo/check - move outline reader upstream to cocos - why? why not?
|
29
|
+
## use read_outline(), parse_outline() - why? why not?
|
28
30
|
require_relative 'parser/outline_reader'
|
29
31
|
require_relative 'parser/linter'
|
30
|
-
require_relative 'parser/opts'
|
31
|
-
|
32
32
|
|
33
33
|
###
|
34
34
|
# make parser api (easily) available - why? why not?
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sportdb-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-09-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: cocos
|
@@ -75,7 +75,7 @@ dependencies:
|
|
75
75
|
description: sportdb-parser - football.txt match parser (& tokenizer)
|
76
76
|
email: gerald.bauer@gmail.com
|
77
77
|
executables:
|
78
|
-
- fbt
|
78
|
+
- fbtok
|
79
79
|
extensions: []
|
80
80
|
extra_rdoc_files:
|
81
81
|
- CHANGELOG.md
|
@@ -86,11 +86,10 @@ files:
|
|
86
86
|
- Manifest.txt
|
87
87
|
- README.md
|
88
88
|
- Rakefile
|
89
|
-
- bin/fbt
|
89
|
+
- bin/fbtok
|
90
90
|
- lib/sportdb/parser.rb
|
91
91
|
- lib/sportdb/parser/lang.rb
|
92
92
|
- lib/sportdb/parser/linter.rb
|
93
|
-
- lib/sportdb/parser/opts.rb
|
94
93
|
- lib/sportdb/parser/outline_reader.rb
|
95
94
|
- lib/sportdb/parser/parser.rb
|
96
95
|
- lib/sportdb/parser/token-date.rb
|
data/lib/sportdb/parser/opts.rb
DELETED
@@ -1,70 +0,0 @@
|
|
1
|
-
|
2
|
-
module SportDb
|
3
|
-
class Parser
|
4
|
-
|
5
|
-
###
|
6
|
-
## note - Opts Helpers for now nested inside Parser - keep here? why? why not?
|
7
|
-
class Opts
|
8
|
-
|
9
|
-
SEASON_RE = %r{ (?:
|
10
|
-
\d{4}-\d{2}
|
11
|
-
| \d{4}(--[a-z0-9_-]+)?
|
12
|
-
)
|
13
|
-
}x
|
14
|
-
SEASON = SEASON_RE.source ## "inline" helper for embedding in other regexes - keep? why? why not?
|
15
|
-
|
16
|
-
|
17
|
-
## note: if pattern includes directory add here
|
18
|
-
## (otherwise move to more "generic" datafile) - why? why not?
|
19
|
-
MATCH_RE = %r{ (?: ^|/ ) # beginning (^) or beginning of path (/)
|
20
|
-
#{SEASON}
|
21
|
-
/[a-z0-9_-]+\.txt$ ## txt e.g /1-premierleague.txt
|
22
|
-
}x
|
23
|
-
|
24
|
-
|
25
|
-
def self.find( path )
|
26
|
-
datafiles = []
|
27
|
-
|
28
|
-
## note: normalize path - use File.expand_path ??
|
29
|
-
## change all backslash to slash for now
|
30
|
-
## path = path.gsub( "\\", '/' )
|
31
|
-
path = File.expand_path( path )
|
32
|
-
|
33
|
-
## check all txt files
|
34
|
-
## note: incl. files starting with dot (.)) as candidates
|
35
|
-
## (normally excluded with just *)
|
36
|
-
candidates = Dir.glob( "#{path}/**/{*,.*}.txt" )
|
37
|
-
## pp candidates
|
38
|
-
candidates.each do |candidate|
|
39
|
-
datafiles << candidate if MATCH_RE.match( candidate )
|
40
|
-
end
|
41
|
-
|
42
|
-
## pp datafiles
|
43
|
-
datafiles
|
44
|
-
end
|
45
|
-
|
46
|
-
|
47
|
-
def self.expand_args( args )
|
48
|
-
paths = []
|
49
|
-
|
50
|
-
args.each do |arg|
|
51
|
-
## check if directory
|
52
|
-
if Dir.exist?( arg )
|
53
|
-
datafiles = find( arg )
|
54
|
-
puts
|
55
|
-
puts " found #{datafiles.size} match txt datafiles in #{arg}"
|
56
|
-
pp datafiles
|
57
|
-
paths += datafiles
|
58
|
-
else
|
59
|
-
## assume it's a file
|
60
|
-
paths << arg
|
61
|
-
end
|
62
|
-
end
|
63
|
-
|
64
|
-
paths
|
65
|
-
end
|
66
|
-
end # class Opts
|
67
|
-
|
68
|
-
|
69
|
-
end # class Parser
|
70
|
-
end # module SportDb
|