sportdb-parser 0.3.9 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Manifest.txt +0 -4
- data/README.md +0 -5
- data/Rakefile +1 -0
- data/lib/sportdb/parser/parser.rb +631 -212
- data/lib/sportdb/parser/token-text.rb +1 -1
- data/lib/sportdb/parser/token.rb +58 -56
- data/lib/sportdb/parser/version.rb +2 -2
- data/lib/sportdb/parser.rb +238 -7
- metadata +20 -11
- data/bin/fbtok +0 -13
- data/lib/sportdb/parser/fbtok/main.rb +0 -141
- data/lib/sportdb/parser/linter.rb +0 -156
- data/lib/sportdb/parser/opts.rb +0 -81
@@ -1,141 +0,0 @@
|
|
1
|
-
|
2
|
-
module Fbtok
  ##
  # Command-line entry point of the fbtok tool.
  #
  # Parses the command-line options, collects the datafiles to check,
  # runs the linter over every datafile and prints the results to stdout.
  #
  # Options:
  #   -q, --quiet          turn debug output off
  #   --verbose, --debug   turn debug output on (the default)
  #   --metal              tokenize only (linter is called with parse: false)
  #   -f, --file FILE      read pathspecs from a .csv file; also turns debug
  #                        output off and prints a summary report at the end
  #
  # Without --file the remaining args are expanded via
  # SportDb::Parser::Opts.expand_args; if there are no args at all
  # two hard-coded euro datafiles are used.
  #
  # @param args [Array<String>] command-line arguments;
  #                             note - option switches get removed in place (parse!)
  def self.main( args=ARGV )

    opts = {
      debug: true,
      metal: false,
      file:  nil,
    }

    ## note: block param is named cmd (NOT parser) to avoid shadowing the outer local
    parser = OptionParser.new do |cmd|
      cmd.banner = "Usage: #{$PROGRAM_NAME} [options] PATH"


      cmd.on( "-q", "--quiet",
              "less debug output/messages - default is (#{!opts[:debug]})" ) do
        opts[:debug] = false
      end
      cmd.on( "--verbose", "--debug",
              "turn on verbose / debug output (default: #{opts[:debug]})" ) do
        opts[:debug] = true
      end

      cmd.on( "--metal",
              "turn off typed parse tree; show to the metal tokens"+
              " (default: #{opts[:metal]})" ) do
        opts[:metal] = true
      end

      cmd.on( "-f FILE", "--file FILE",
              "read datafiles (pathspecs) via .csv file") do |file|
        opts[:file] = file
        ## note: for batch (massive) processing auto-set debug (verbose output) to false (as default)
        opts[:debug] = false
      end
    end
    parser.parse!( args )

    puts "OPTS:"
    p opts
    puts "ARGV:"
    p args


    ## todo/check - use packs or projects or such
    ##                instead of specs - why? why not?
    ##
    ## every spec is a pair of [paths, rec]
    ##   rec is the csv record (--file) or an empty hash
    specs = []
    if opts[:file]
      ## NOTE(review): read_csv is not defined in this file -
      ##   presumably returns records that respond to ['path']; confirm
      recs = read_csv( opts[:file] )
      pp recs
      ## note - make pathspecs relative to passed in file arg!!!
      basedir = File.dirname( opts[:file] )
      recs.each do |rec|
        paths = SportDb::Parser::Opts.find( rec['path'], dir: basedir )
        specs << [paths, rec]
      end
    else
      paths = if args.empty?
                [
                  '../../../openfootball/euro/2021--europe/euro.txt',
                  '../../../openfootball/euro/2024--germany/euro.txt',
                ]
              else
                ## check for directories
                ##   and auto-expand
                SportDb::Parser::Opts.expand_args( args )
              end
      specs << [paths, {}]
    end


    ## note: debug only gets switched on here (never switched off)
    SportDb::Parser::Linter.debug = true    if opts[:debug]

    linter = SportDb::Parser::Linter.new


    specs.each do |paths, rec|
      errors = []

      paths.each_with_index do |path,j|
        puts "==> [#{j+1}/#{paths.size}] reading >#{path}<..."
        linter.read( path, parse: !opts[:metal] )

        ## note: linter resets its error list on every read -
        ##         collect the errors per spec here
        errors.concat( linter.errors )    if linter.errors?
      end

      if errors.empty?
        puts
        puts "OK no parse errors found in #{paths.size} datafile(s)"
      else
        puts
        pp errors
        puts
        puts "!! #{errors.size} parse error(s) in #{paths.size} datafile(s)"
      end

      ## add errors to rec via rec['errors'] to allow
      ##   for further processing/reporting
      rec['errors'] = errors
    end


    ###
    ## generate a report if --file option used
    if opts[:file]

      buf = String.new

      buf << "# fbtok summary report - #{specs.size} dataset(s)\n\n"

      specs.each do |paths, rec|
        errors = rec['errors']

        if errors.empty?
          buf << " OK "
        else
          buf << "!! #{errors.size} ERROR(S) "
        end
        buf << "%-20s" % rec['path']
        buf << " - #{paths.size} datafile(s)"
        buf << "\n"

        unless errors.empty?
          buf << errors.pretty_inspect
          buf << "\n"
        end
      end

      puts
      puts "SUMMARY:"
      puts buf

      # maybe write out in the future?
      # basedir = File.dirname( opts[:file] )
      # basename = File.basename( opts[:file], File.extname( opts[:file] ))
    end

  end # method self.main
end # module Fbtok
|
@@ -1,156 +0,0 @@
|
|
1
|
-
|
2
|
-
module SportDb
  class Parser

    ###
    ## note - Linter for now nested inside Parser - keep? why? why not?
    ##
    ##  Reads a datafile (via OutlineReader), runs every text line through
    ##    the parser (tokenize only or tokenize & parse) and
    ##    collects all reported error messages in errors.
    class Linter

      ## class-wide debug switch (shared by all linter instances)
      def self.debug=(value) @@debug = value; end
      def self.debug?() @@debug ||= false; end   ## note: default is FALSE
      def debug?()  self.class.debug?; end


      ## errors of the last read call -
      ##   a list of [path, msg, line] triplets
      attr_reader :errors

      def initialize
        @errors = []
        @parser = Parser.new   ## use own parser instance (not shared) - why? why not?
      end


      def errors?() @errors.size > 0; end



      ## note: colon (:) MUST be followed by one (or more) spaces
      ##      make sure mon feb 12 18:10 will not match
      ##        allow 1. FC Köln etc.
      ##               Mainz 05:
      ##           note: key is one char plus up to 30 more chars
      ##           only allow chars incl. intl but (NOT ()[]/;)
      ##
      ##   Group A:
      ##   Group B:   - remove colon
      ##    or lookup first

      ATTRIB_RE = %r{^
                       [ ]*?     # slurp leading spaces
                    (?<key>[^:|\]\[()\/; -]
                           [^:|\]\[()\/;]{0,30}
                     )
                       [ ]*?     # slurp trailing spaces
                       :[ ]+
                    (?<value>.+)
                        [ ]*?   # slurp trailing spaces
                       $
                    }ix


      #########
      ## Lint the datafile in path.
      ##
      ## parse - false (default) - tokenize (only)
      ##       - true            - tokenize & parse
      ##
      ## Collected errors are available via errors / errors? after the call.
      ## Returns the nodes as read by OutlineReader.
      ## Raises ArgumentError on an unsupported node type.
      def read( path, parse: false )
        ## note: every (new) read call - resets errors list to empty
        @errors = []

        ## NOTE(review): OutlineReader is not defined in this file -
        ##   nodes are used as [type, payload] pairs with
        ##   type :h1, :h2 or :p (payload of :p is a list of lines)
        nodes = OutlineReader.read( path )

        ## process nodes
        h1      = nil
        orphans = 0    ## track paragraphs with no heading

        ## NOTE(review): attrib state is shared by all paragraphs (never reset
        ##   in between) and the attrib line itself is NOT checked for
        ##   a trailing dot - confirm both are intended
        attrib_found = false


        nodes.each do |node|
          type = node[0]

          case type
          when :h1
            h1 = node[1]  ## get heading text
            puts " = Heading 1 >#{node[1]}<"
          when :h2
            if h1.nil?
              puts "!! WARN - no heading for subheading; skipping parse"
              next
            end
            puts " == Heading 2 >#{node[1]}<"
          when :p
            if h1.nil?
              orphans += 1    ## note: warns on every orphan (with running count)
              puts "!! WARN - no heading for #{orphans} text paragraph(s); skipping parse"
              next
            end

            node[1].each do |line|
              if debug?
                puts
                puts "line >#{line}<"
              end

              ## skip new (experimental attrib syntax)
              if !attrib_found && ATTRIB_RE.match?( line )
                ## note: check attrib regex AFTER group def e.g.:
                ##         Group A:
                ##         Group B:  etc.
                ##     todo/fix - change Group A: to Group A etc.
                ##                       Group B: to Group B
                attrib_found = true
                next
              end

              if attrib_found
                ## skip continuation lines -
                ##   up to (and incl.) the first line ending with a dot
                attrib_found = false   if line.end_with?( '.' )
                next
              end

              t, error_messages = if parse
                                    @parser.parse_with_errors( line )
                                  else
                                    @parser.tokenize_with_errors( line )
                                  end

              ## add to "global" error list
              ##   make a triplet tuple (file / msg / line text)
              error_messages.each do |msg|
                @errors << [path, msg, line]
              end

              pp t   if debug?
            end
          else
            pp node
            raise ArgumentError, "unsupported (node) type >#{type}<"
          end
        end  # each node
      end  # read
    end  # class Linter


  end   # class Parser
end     # module SportDb
|
data/lib/sportdb/parser/opts.rb
DELETED
@@ -1,81 +0,0 @@
|
|
1
|
-
|
2
|
-
module SportDb
  class Parser


    ###
    ## note - Opts Helpers for now nested inside Parser - keep here? why? why not?
    ##
    ##  Helpers to turn command-line args / pathspecs into a list of datafiles.
    class Opts

      ## season directory e.g. 2024-25 or 2024 or 2024--germany
      SEASON_RE = %r{ (?:
                           \d{4}-\d{2}
                         | \d{4}(--[a-z0-9_-]+)?
                      )
                    }x
      SEASON = SEASON_RE.source    ## "inline" helper for embedding in other regexes - keep? why? why not?


      ## note: if pattern includes directory add here
      ##     (otherwise move to more "generic" datafile) - why? why not?
      ##   update - note include/allow dot (.) too
      ##              e.g. 2024-25/at.1.txt
      ##                        change to at_1 or uefa_cl or such - why? why not?
      MATCH_RE = %r{ (?: ^|/ )      # beginning (^) or beginning of path (/)
                       #{SEASON}
                         /[a-z0-9_.-]+\.txt$  ## txt e.g /1-premierleague.txt
                    }x


      ## Find all datafiles below the directory path, that is,
      ##   all .txt files (incl. files starting with a dot) located
      ##   directly inside a season directory (see MATCH_RE).
      ##
      ## path - directory to search (recursively)
      ## dir  - optional base directory; a relative path is resolved against it
      ##
      ## Returns a list of (expanded / absolute) datafile paths.
      def self.find( path, dir: nil )
        ## check - rename dir
        ##          use root_dir or work_dir or cd or such - why? why not?

        ## note: normalize path - use File.expand_path ??
        ##    change all backslash to slash for now
        ## path = path.gsub( "\\", '/' )
        root = if dir
                 File.expand_path( path, File.expand_path( dir ))
               else
                 File.expand_path( path )
               end

        ## check all txt files
        ## note: incl. files starting with dot (.)) as candidates
        ##     (normally excluded with just *)
        ##
        ## note: pass the directory in via base (NOT as part of the pattern) -
        ##   otherwise glob chars in a directory name (e.g. euro[2024])
        ##   get interpreted as pattern and nothing is found
        candidates = Dir.glob( '**/{*,.*}.txt', base: root )
                        .map { |relpath| File.join( root, relpath ) }
        ## pp candidates

        candidates.select { |candidate| MATCH_RE.match?( candidate ) }
      end


      ## Expand the command-line args into a list of paths.
      ##   A directory arg gets replaced by all datafiles found inside (see find);
      ##   everything else is assumed to be a file and passed through as is.
      ##
      ## Returns a list of paths (in the order of the args).
      def self.expand_args( args )
        args.flat_map do |arg|
          ## check if directory
          if Dir.exist?( arg )
            datafiles = find( arg )
            puts
            puts " found #{datafiles.size} match txt datafiles in #{arg}"
            pp datafiles
            datafiles
          else
            ## assume it's a file
            [arg]
          end
        end
      end
    end  # class Opts


  end   # class Parser
end     # module SportDb
|