sportdb-parser 0.3.9 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,141 +0,0 @@
1
-
2
module Fbtok

  ## Command-line entry point of the fbtok lint tool.
  ##
  ##  args - command-line arguments (defaults to ARGV);
  ##         note: recognized options get removed in-place
  ##               (via OptionParser#parse!) - what remains are the PATH args
  ##
  ##  Lints all datafiles (tokenize & parse or with --metal tokenize only)
  ##  and prints the results to stdout;
  ##  if the --file option is used a summary report gets printed too.
  def self.main( args=ARGV )
    opts = parse_opts( args )

    puts "OPTS:"
    p opts
    puts "ARGV:"
    p args

    ## todo/check - use packs or projects or such
    ##                instead of specs - why? why not?
    specs = build_specs( args, opts )

    SportDb::Parser::Linter.debug = true    if opts[:debug]

    linter = SportDb::Parser::Linter.new

    specs.each do |paths, rec|
      ## add errors to rec via rec['errors'] to allow
      ##   for further processing/reporting
      rec['errors'] = lint_paths( linter, paths, parse: !opts[:metal] )
    end

    ###
    ## generate a report if --file option used
    print_summary( specs )   if opts[:file]
  end # method self.main


  ## Parse the command-line options (removes the options from args in-place).
  ##  Returns the options hash with the keys :debug, :metal and :file.
  def self.parse_opts( args )
    opts = {
      debug: true,
      metal: false,
      file:  nil,
    }

    parser = OptionParser.new do |cmd|
      cmd.banner = "Usage: #{$PROGRAM_NAME} [options] PATH"

      cmd.on( "-q", "--quiet",
              "less debug output/messages - default is (#{!opts[:debug]})" ) do
        opts[:debug] = false
      end
      cmd.on( "--verbose", "--debug",
              "turn on verbose / debug output (default: #{opts[:debug]})" ) do
        opts[:debug] = true
      end

      cmd.on( "--metal",
              "turn off typed parse tree; show to the metal tokens"+
              " (default: #{opts[:metal]})" ) do
        opts[:metal] = true
      end

      cmd.on( "-f FILE", "--file FILE",
              "read datafiles (pathspecs) via .csv file") do |file|
        opts[:file] = file
        ## note: for batch (massive) processing auto-set debug (verbose output) to false (as default)
        opts[:debug] = false
      end
    end
    parser.parse!( args )

    opts
  end


  ## Build the list of datasets to lint.
  ##  Returns an array of [paths, rec] pairs
  ##    - one pair per record if the --file option is used, otherwise
  ##    - a single pair with an empty rec for the PATH args
  ##       (or the built-in sample datafiles if no args are passed in).
  def self.build_specs( args, opts )
    if opts[:file]
      recs = read_csv( opts[:file] )
      pp recs
      ## note - make pathspecs relative to passed in file arg!!!
      basedir = File.dirname( opts[:file] )
      recs.map do |rec|
        [SportDb::Parser::Opts.find( rec['path'], dir: basedir ), rec]
      end
    else
      paths = if args.empty?
                [
                  '../../../openfootball/euro/2021--europe/euro.txt',
                  '../../../openfootball/euro/2024--germany/euro.txt',
                ]
              else
                ## check for directories
                ##   and auto-expand
                SportDb::Parser::Opts.expand_args( args )
              end
      [[paths, {}]]
    end
  end


  ## Lint all datafiles of one dataset and print the result.
  ##  Returns the collected errors (empty array if all is ok).
  def self.lint_paths( linter, paths, parse: )
    errors = []

    paths.each_with_index do |path,j|
      puts "==> [#{j+1}/#{paths.size}] reading >#{path}<..."
      linter.read( path, parse: parse )

      errors += linter.errors    if linter.errors?
    end

    puts
    if errors.empty?
      puts "OK no parse errors found in #{paths.size} datafile(s)"
    else
      pp errors
      puts
      puts "!! #{errors.size} parse error(s) in #{paths.size} datafile(s)"
    end

    errors
  end


  ## Print the summary report (one entry per dataset); expects rec['errors'] to be set.
  def self.print_summary( specs )
    buf = String.new

    buf << "# fbtok summary report - #{specs.size} dataset(s)\n\n"

    specs.each do |paths, rec|
      errors = rec['errors']

      buf << (errors.empty? ? " OK " : "!! #{errors.size} ERROR(S) ")
      buf << "%-20s" % rec['path']
      buf << " - #{paths.size} datafile(s)"
      buf << "\n"

      unless errors.empty?
        buf << errors.pretty_inspect
        buf << "\n"
      end
    end

    puts
    puts "SUMMARY:"
    puts buf

    # maybe write out in the future?
    # basedir  = File.dirname( opts[:file] )
    # basename = File.basename( opts[:file], File.extname( opts[:file] ))
  end

  private_class_method :parse_opts, :build_specs, :lint_paths, :print_summary

end # module Fbtok
@@ -1,156 +0,0 @@
1
-
2
module SportDb
class Parser

###
## note - Linter for now nested inside Parser - keep? why? why not?
##
##  Reads a datafile (via OutlineReader) and runs every text line
##   through the parser - tokenize only or tokenize & parse;
##   all error messages get collected in errors.
class Linter

  ## note: the debug flag is a class-level switch (shared by all linters)
  def self.debug=(value) @@debug = value; end
  def self.debug?()      @@debug ||= false; end  ## note: default is FALSE
  def debug?()  self.class.debug?; end



  ## list of [path, msg, line] triplets of the last read call
  attr_reader :errors

  def initialize
    @errors = []
    @parser = Parser.new   ## use own parser instance (not shared) - why? why not?
  end


  def errors?() @errors.size > 0; end



  ## note: colon (:) MUST be followed by one (or more) spaces
  ##      make sure mon feb 12 18:10 will not match
  ##        allow 1. FC Köln etc.
  ##               Mainz 05:
  ##           limit to 30 chars max
  ##          only allow chars incl. intl but (NOT ()[]/;)
  ##
  ##   Group A:
  ##   Group B:   - remove colon
  ##    or lookup first

  ATTRIB_RE = %r{^
                   [ ]*?     # slurp leading spaces
                (?<key>[^:|\]\[()\/; -]
                       [^:|\]\[()\/;]{0,30}
                 )
                   [ ]*?     # slurp trailing spaces
                   :[ ]+
                (?<value>.+)
                    [ ]*?   # slurp trailing spaces
                   $
                }ix


  #########
  ## Lint the datafile at path.
  ##
  ##  parse - false (default) - tokenize (only)
  ##        - true            - tokenize & parse
  ##
  ##  note: every (new) read call - resets errors list to empty
  ##
  ##  raises ArgumentError on an unsupported (outline) node type
  def read( path, parse: false )
    @errors = []

    nodes = OutlineReader.read( path )

    ## process nodes
    h1 = nil
    orphans = 0    ## track paragraphs with no heading

    attrib_found = false   ## true while inside a (multi-line) attribute


    nodes.each do |node|
      type = node[0]

      case type
      when :h1
        h1 = node[1]  ## get heading text
        puts " = Heading 1 >#{node[1]}<"
      when :h2
        if h1.nil?
          puts "!! WARN - no heading for subheading; skipping parse"
          next
        end
        puts " == Heading 2 >#{node[1]}<"
      when :p
        if h1.nil?
          orphans += 1
          puts "!! WARN - no heading for #{orphans} text paragraph(s); skipping parse"
          next
        end

        node[1].each do |line|
          if debug?
            puts
            puts "line >#{line}<"
          end

          if attrib_found
            ## inside a multi-line attribute -
            ##   slurp up (skip) lines up to (and incl.) the line ending with a dot
            attrib_found = false   if line.end_with?( '.' )
            next
          end

          ## skip new (experimental attrib syntax)
          ##  note: check attrib regex AFTER group def e.g.:
          ##         Group A:
          ##         Group B:  etc.
          ##       todo/fix - change Group A: to Group A etc.
          ##                         Group B: to Group B
          if ATTRIB_RE.match?( line )
            ## note: an attribute already ending with a dot is complete;
            ##        only continue slurping lines if the closing dot is still missing
            ##        (otherwise all following lines get skipped by mistake)
            attrib_found = !line.end_with?( '.' )
            next
          end

          lint_line( path, line, parse: parse )
        end
      else
        pp node
        raise ArgumentError, "unsupported (node) type >#{type}<"
      end
    end  # each node
  end  # read


  private

  ## Tokenize (or tokenize & parse) a single line and
  ##  add all error messages to the "global" error list
  ##  as triplet tuples (file / msg / line text).
  def lint_line( path, line, parse: )
    t, error_messages = if parse
                          @parser.parse_with_errors( line )
                        else
                          @parser.tokenize_with_errors( line )
                        end

    error_messages.each do |msg|
      @errors << [ path, msg, line ]
    end

    pp t   if debug?
  end
end  # class Linter


end   # class Parser
end   # module SportDb
@@ -1,81 +0,0 @@
1
-
2
module SportDb
class Parser


###
## note - Opts Helpers for now nested inside Parser - keep here? why? why not?
##
##  Helpers to find (match) datafiles for the command-line tools.
class Opts

    SEASON_RE = %r{ (?:
                       \d{4}-\d{2}
                     | \d{4}(--[a-z0-9_-]+)?
                    )
                  }x
    SEASON = SEASON_RE.source    ## "inline" helper for embedding in other regexes - keep? why? why not?


    ## note: if pattern includes directory add here
    ##     (otherwise move to more "generic" datafile) - why? why not?
    ##   update - note include/allow dot (.) too
    ##             e.g. 2024-25/at.1.txt
    ##                   change to at_1 or uefa_cl or such - why? why not?
    MATCH_RE = %r{ (?: ^|/ )      # beginning (^) or beginning of path (/)
                       #{SEASON}
                     /[a-z0-9_.-]+\.txt$  ## txt e.g /1-premierleague.txt
                }x


    ## Find all datafiles (matching MATCH_RE) below the directory path.
    ##
    ##  path - directory to search (recursively)
    ##  dir  - optional base directory; if passed in
    ##           path gets resolved relative to dir
    ##
    ##  Returns an array of (absolute) datafile paths.
    def self.find( path, dir: nil )
      ## check - rename dir
      ##          use root_dir or work_dir or cd or such - why? why not?

      ## note: normalize path - use File.expand_path ??
      ##    change all backslash to slash for now
      ## path = path.gsub( "\\", '/' )
      path = if dir
               File.expand_path( path, File.expand_path( dir ))
             else
               File.expand_path( path )
             end

      ## check all txt files
      ## note: incl. files starting with dot (.)) as candidates
      ##     (normally excluded with just *)
      ## note: pass in path via base (and NOT as part of the glob pattern)
      ##     otherwise glob chars e.g. []{}*? in directory names
      ##     get interpreted as pattern (and datafiles get missed)
      candidates = Dir.glob( '**/{*,.*}.txt', base: path )
                      .map { |name| File.join( path, name ) }
      ## pp candidates

      datafiles = candidates.select { |candidate| MATCH_RE.match?( candidate ) }
      ## pp datafiles
      datafiles
    end


    ## Expand the command-line args into a list of datafile paths;
    ##   directories get auto-expanded (via find) and
    ##   everything else is assumed to be a file and passed along as is.
    def self.expand_args( args )
      args.flat_map do |arg|
        ## check if directory
        if Dir.exist?( arg )
          datafiles = find( arg )
          puts
          puts " found #{datafiles.size} match txt datafiles in #{arg}"
          pp datafiles
          datafiles
        else
          ## assume it's a file
          [arg]
        end
      end
    end
end  # class Opts


end   # class Parser
end   # module SportDb