sportdb-parser 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 1466b82654b4a4f0f823a96709488dedb595d08731a55abc128691e0ffe2a80b
4
+ data.tar.gz: 14995e94dc079ab61e77d056d15c9a5830dc573129661ca453b2892d087c2061
5
+ SHA512:
6
+ metadata.gz: 75c2b4f455e8bb1b5e471c39f8fa3b5069bd0bb2a808ad8b246c0f2b060c5416f9f56a3619ad7db7ac5f21a6177c762aa28ae8e9c939b03a2569cf27d34f9b81
7
+ data.tar.gz: 9c4f9095a61410499ae7628b1eb3295d8f456e62feae45a4c254d9157904326abf6571f3c4a04c078551b6364cd09252509f709bfeef46a569dbe202f4058460
data/CHANGELOG.md ADDED
@@ -0,0 +1,3 @@
1
+ ### 0.0.1 / 2024-07-12
2
+
3
+ * Everything is new. First release.
data/Manifest.txt ADDED
@@ -0,0 +1,14 @@
1
+ CHANGELOG.md
2
+ Manifest.txt
3
+ README.md
4
+ Rakefile
5
+ bin/fbt
6
+ lib/sportdb/parser.rb
7
+ lib/sportdb/parser/lang.rb
8
+ lib/sportdb/parser/linter.rb
9
+ lib/sportdb/parser/outline_reader.rb
10
+ lib/sportdb/parser/parser.rb
11
+ lib/sportdb/parser/token-date.rb
12
+ lib/sportdb/parser/token-score.rb
13
+ lib/sportdb/parser/token-text.rb
14
+ lib/sportdb/parser/token.rb
data/README.md ADDED
@@ -0,0 +1,8 @@
1
+ # sportdb-parser - football.txt match parser (& tokenizer)
2
+
3
+
4
+
5
+
6
+
7
+
8
+
data/Rakefile ADDED
@@ -0,0 +1,27 @@
1
+ require 'hoe'
2
+
3
+
4
+ Hoe.spec 'sportdb-parser' do
5
+
6
+ self.version = '0.0.1'
7
+
8
+ self.summary = "sportdb-parser - football.txt match parser (& tokenizer)"
9
+ self.description = summary
10
+
11
+ self.urls = { home: 'https://github.com/sportdb/sport.db' }
12
+
13
+ self.author = 'Gerald Bauer'
14
+ self.email = 'gerald.bauer@gmail.com'
15
+
16
+ # switch extension to .markdown for github formatting
17
+ self.readme_file = 'README.md'
18
+ self.history_file = 'CHANGELOG.md'
19
+
20
+ self.licenses = ['Public Domain']
21
+
22
+ self.extra_deps = []
23
+
24
+ self.spec_extras = {
25
+ required_ruby_version: '>= 2.2.2'
26
+ }
27
+ end
data/bin/fbt ADDED
@@ -0,0 +1,144 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ ## tip: to test run:
4
+ ## ruby -I ./lib bin/fbt
5
+
6
+ require 'sportdb/parser'
7
+
8
+
9
+ require 'optparse'
10
+
11
+ ##
12
+ ## read textfile
13
+ ## and dump tokens
14
+ ##
15
+ ## fbt ../openfootball/.../euro.txt
16
+
17
+
18
## season directory name - either
##   a season spanning two years e.g. 2024-25, or
##   a single year with an optional "--slug" e.g. 2024 or 2024--germany
SEASON_RE = %r{ (?:
                   \d{4}-\d{2}
                 | \d{4}(--[a-z0-9_-]+)?
                )
              }x
SEASON = SEASON_RE.source    ## "inline" helper for embedding in other regexes - keep? why? why not?


## note: if pattern includes directory add here
##  (otherwise move to more "generic" datafile) - why? why not?
MATCH_RE = %r{ (?: ^|/ )      # beginning (^) or beginning of path (/)
               #{SEASON}
               /[a-z0-9_-]+\.txt$  ## txt e.g /1-premierleague.txt
            }x


##
## collect all .txt datafiles below path whose (full) path matches pattern
##
##   path    - directory to search (recursively)
##   pattern - anything responding to match( string );
##               defaults to MATCH_RE (season directory + datafile name)
##
##  returns a (new) array of matching paths (in Dir.glob order)
def find( path, pattern=MATCH_RE )
  ## check all txt files
  ##  note: incl. files starting with dot (.) as candidates (normally excluded with just *)
  Dir.glob( "#{path}/**/{*,.*}.txt" ).select do |candidate|
    pattern.match( candidate )
  end
end
48
+
49
+
50
+
51
+
52
+
53
+
54
## command-line arguments and option defaults
##   note: args is the very same array as ARGV - parse! below removes
##          all recognized option switches in-place
args = ARGV
opts = { debug: false,
         metal: false }

parser = OptionParser.new do |cmd|
  cmd.banner = "Usage: #{$PROGRAM_NAME} [options]"

  ##
  ## check if git has a offline option?? (use same)
  ##           check for other tools - why? why not?

  ## both switch names map to the same debug flag
  cmd.on( "--verbose", "--debug",
          "turn on verbose / debug output (default: #{opts[:debug]} )" ) do |flag|
    opts[:debug] = flag
  end

  cmd.on( "--metal",
          "turn off typed parse tree; show to the metal tokens"+
          " (default: #{opts[:metal]})" ) do |flag|
    opts[:metal] = flag
  end
end
parser.parse!( args )

## always echo the resulting options and the remaining (non-option) arguments
puts "OPTS:"
p opts
puts "ARGV:"
p args
83
+
84
+
85
+
86
+
87
+
88
+
89
##
## expand command-line args into a flat list of datafile paths
##
##   every arg that is an (existing) directory gets replaced
##    by the datafiles found (via find) inside;
##   every other arg is assumed to be a file and passed along as is
def expand_args( args )
  args.flat_map do |arg|
    if Dir.exist?( arg )
      datafiles = find( arg )
      puts
      puts " found #{datafiles.size} match txt datafiles in #{arg}"
      pp datafiles
      datafiles
    else
      ## assume it's a file
      [arg]
    end
  end
end
108
+
109
+
110
## datafiles to lint - fall back to two (hard-coded) sample datafiles
##   if no command-line arguments are given
paths = if args.empty?
          [
            '../../../openfootball/euro/2020--europe/euro.txt',
            '../../../openfootball/euro/2024--germany/euro.txt',
          ]
        else
          ## check for directories
          ##   and auto-expand
          expand_args( args )
        end


SportDb::Parser::Linter.debug = true    if opts[:debug]

linter = SportDb::Parser::Linter.new


## note: --metal turns OFF parsing (tokenize only)
paths.each_with_index do |path,i|
  puts "==> [#{i+1}/#{paths.size}] reading >#{path}<..."
  linter.read( path, parse: !opts[:metal] )
end

## errors get collected by the linter across all datafiles
if linter.errors?
  puts
  pp linter.errors
  puts "!! #{linter.errors.size} parse error(s) in #{paths.size} datafile(s)"
else
  puts "OK no parse errors found in #{paths.size} datafile(s)"
end

puts "bye"
144
+
@@ -0,0 +1,111 @@
1
+
2
+ ## use Sports (not SportDb) for module - why? why not?
3
+
4
+
5
+
6
module SportDb
class Parser

## Group A-Z
## Group 1-99
## Group HEX  # used in concacaf world cup quali
## Group 1A or A1, B1  - used anywhere
##
##  use "key" of group - why? why not?

GROUP_RE = %r{^
                Group [ ]
                   (?<key>[a-z0-9]+)
              $}ix

## check if text is a group name e.g. Group A, Group 12, Group HEX
##   (case-insensitive); returns true or false
def is_group?( text )
  ## use regex for match
  GROUP_RE.match?( text )
end



ROUND_RE = %r{^(

    # round - note - requires number e.g. round 1,2, etc.
     (?: (?: Round |
             Matchday |
             Week
         )
         [ ] [0-9]+
     )
     |
    # more (knockout) rounds
    # playoffs  - playoff, play-off, play-offs
       (?: Play-?offs?
            (?: [ ]for[ ]quarter-?finals )?
       )
       |
    # round32
      (?: Round[ ]of[ ]32 |
          Last[ ]32 )
       |
    # round16
      (?: Round[ ]of[ ]16 |
          Last[ ]16 |
          8th[ ]finals )
       |
    # fifthplace
      (?:
          (?: (Fifth|5th)[ -]place
               (?: [ ] (?: match|play-?off|final ))?
           ) |
          (?: Match[ ]for[ ](?: fifth|5th )[ -]place )
      )
       |
    # thirdplace
      (?:
          (?: (Third|3rd)[ -]place
               (?: [ ] (?: match|play-?off|final ))?
           ) |
          (?: Match[ ]for[ ](?: third|3rd )[ -]place )
      )
       |
    # quarterfinals
      (?:
         Quarter-?finals? |
         Quarters |
         Last[ ]8
      )
       |
    # semifinals
      (?:
          Semi-?finals? |
          Semis |
          Last[ ]4
      )
       |
    # final
       Finals?

  )$}ix


## check if text is a round name e.g. Matchday 12, Round of 16, Quarter-finals, Final
##   (case-insensitive); returns true or false
def is_round?( text )
  ROUND_RE.match?( text )
end

##
##  keep leg separate (from round) - why? why not?
##
##  note: the alternation MUST be wrapped in a group;
##         otherwise ^ only applies to the first branch and $ only to the last
##         and e.g. "1st leg xyz" or "xyz 2nd leg" would match too
LEG_RE = %r{^
             (?:
               # leg1
                (?: 1st|First)[ ]leg
                 |
               # leg2
                (?: 2nd|Second)[ ]leg
             )
           $}ix

### Pair matches/games if marked with leg1 n leg2
##
##  check if text is (exactly) a leg name e.g. 1st leg, First Leg, 2nd leg, Second Leg
##   (case-insensitive); returns true or false
def is_leg?( text )
  LEG_RE.match?( text )
end


end  # class Parser
end  # module SportDb
@@ -0,0 +1,153 @@
1
+
2
module SportDb
class Parser

###
## note - Linter for now nested inside Parser - keep? why? why not?
##
##  reads datafiles (via OutlineReader), tokenizes (or parses)
##   every text line and collects all reported error messages in errors
class Linter

  def self.debug=(value) @@debug = value; end
  def self.debug?()      @@debug ||= false; end  ## note: default is FALSE
  def debug?()           self.class.debug?; end

  ## keep typed - why? why not?
  ##  - used anywhere?
  def self.typed=(value) @@typed = value; end
  def self.typed?()
    ## note: default is TRUE
    ##   do NOT use @@typed ||= true here -
    ##     it would flip an explicit false back to true on every read
    @@typed = true    unless defined?( @@typed )
    @@typed
  end
  def typed?()           self.class.typed?; end



  ## list of errors collected so far;
  ##   every entry is a triplet tuple of [path, message, line text]
  attr_reader :errors

  def initialize
    @errors = []
    @parser = Parser.new   ## use own parser instance (not shared) - why? why not?
  end


  def errors?() !@errors.empty?; end



  ## note: colon (:) MUST be followed by one (or more) spaces
  ##      make sure mon feb 12 18:10 will not match
  ##        allow 1. FC Köln etc.
  ##               Mainz 05:
  ##           limit to 30 chars max
  ##          only allow  chars incl. intl but (NOT  ()[]/;)
  ##
  ##   Group A:
  ##   Group B:   - remove colon
  ##    or lookup first

  ATTRIB_RE = %r{^
                   [ ]*?     # slurp leading spaces
                (?<key>[^:|\]\[()\/; -]
                       [^:|\]\[()\/;]{0,30}
                 )
                   [ ]*?     # slurp trailing spaces
                   :[ ]+
                (?<value>.+)
                    [ ]*?   # slurp trailing spaces
                   $
                }ix


  #########
  ## read (and lint) the datafile at path
  ##
  ##  parse - false (default) - tokenize (only)
  ##        - true            - tokenize & parse
  ##
  ##  all error messages reported for a line get added to errors
  ##
  ##  raises ArgumentError for any outline node other than
  ##    a level-one heading (:h1) or a paragraph (:p)
  def read( path, parse: false )
    nodes = OutlineReader.read( path )

    ## process nodes
    h1           = nil
    orphans      = 0      ## track paragraphs with no heading

    ## note: kept across paragraphs (NOT reset for every node)
    attrib_found = false


    nodes.each do |node|
      type = node[0]

      case type
      when :h1
        h1 = node[1]  ## get heading text
        puts
        puts " = Heading 1 >#{node[1]}<"
      when :p

        if h1.nil?
          orphans += 1    ## note: warns for every orphan paragraph (with running count)
          puts "!! WARN - no heading for #{orphans} text paragraph(s); skipping parse"
          next
        end

        node[1].each do |line|

          if debug?
            puts
            puts "line >#{line}<"
          end


          ## skip new (experimental attrib syntax)
          if !attrib_found && ATTRIB_RE.match?( line )
            ## note: check attrib regex AFTER group def e.g.:
            ##         Group A:
            ##         Group B:  etc.
            ##     todo/fix - change Group A: to Group A etc.
            ##                       Group B: to Group B
            attrib_found = true
            next
          end

          if attrib_found
            ## check if line ends with dot
            ##  if not slurp up lines to the next dot!!!
            ## NOTE(review): the attrib line itself is NOT checked for a trailing dot,
            ##   that is, at least one more line always gets skipped - confirm that is intended
            attrib_found = false   if line.end_with?( '.' )
            next
          end

          t, error_messages = if parse
                                @parser.parse_with_errors( line )
                              else
                                @parser.tokenize_with_errors( line )
                              end

          ## add to "global" error list
          ##   make a triplet tuple (file / msg / line text)
          error_messages.each do |msg|
            @errors << [ path, msg, line ]
          end

          pp t   if debug?
        end
      else
        pp node
        raise ArgumentError, "unsupported (node) type >#{type}<"
      end
    end  # each node
  end  # read
end  # class Linter


end   # class Parser
end   # module SportDb
@@ -0,0 +1,101 @@
1
+
2
+ ###
3
+ ## todo/fix - move to sportdb-parser - why? why not? !!!!!!
4
+ ##
5
+
6
+
7
module SportDb

##
## reads a text into a flat outline, that is, a list of nodes:
##    [:h1, 'heading text'], [:h2, 'heading text'], ...  for headings, and
##    [:p, ['line 1', 'line 2', ...]]                     for paragraphs
##
##  blank lines separate paragraphs; comments (#) get stripped
##   and everything following a line with __END__ gets ignored
class OutlineReader

  def self.debug=(value) @@debug = value; end
  def self.debug?()      @@debug ||= false; end
  def debug?()           self.class.debug?; end



  ## read the (utf-8) text file at path and return the parsed outline
  def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
    ## note: bom|utf-8 strips a leading byte order mark (if present);
    ##         otherwise a heading in the very first line would not start with =
    ##         and end up as a paragraph
    txt = File.open( path, 'r:bom|utf-8' ) {|f| f.read }
    parse( txt )
  end

  ## parse the text passed in and return the outline
  def self.parse( txt )
    new( txt ).parse
  end

  def initialize( txt )
    @txt = txt
  end

  ## note:  skip "decorative" only heading e.g. ========
  ##  todo/check:  find a better name e.g. HEADING_EMPTY_RE or HEADING_LINE_RE or ???
  HEADING_BLANK_RE = %r{\A
                        ={1,}
                        \z}x

  ## note: like in wikimedia markup (and markdown) all optional trailing ==== too
  HEADING_RE = %r{\A
                  (?<marker>={1,})       ## 1. leading ======
                    [ ]*
                  (?<text>[^=]+)         ## 2. text  (note: for now no "inline" = allowed)
                    [ ]*
                    =*                   ## 3. (optional) trailing ====
                  \z}x

  ## returns the outline, that is, an array of heading and paragraph nodes
  ##
  ##  raises RuntimeError if the outline gets into an invalid state
  ##   (a continuation line without a paragraph to add to)
  def parse
    outline=[]   ## outline structure
    start_para = true      ## start new para(graph) on new text line?

    @txt.each_line do |line|
        line = line.strip      ## todo/fix: keep leading and trailing spaces - why? why not?

        if line.empty?    ## todo/fix: keep blank line nodes?? and just remove comments and process headings?! - why? why not?
          start_para = true
          next
        end

        break if line == '__END__'

        next if line.start_with?( '#' )   ## skip comments too
        ## strip inline (until end-of-line) comments too
        ##  e.g Eupen | KAS Eupen ## [de]
        ##   => Eupen | KAS Eupen
        ##  e.g bq   Bonaire,  BOE        # CONCACAF
        ##   => bq   Bonaire,  BOE
        line = line.sub( /#.*/, '' ).strip
        pp line    if debug?

        ## todo/check: also use heading blank as paragraph "breaker" or treat it like a comment ?? - why? why not?
        next if HEADING_BLANK_RE.match( line )  # skip "decorative" only heading e.g. ========

        ## note: like in wikimedia markup (and markdown) all optional trailing ==== too
        if m=HEADING_RE.match( line )
           start_para = true

           heading_level  = m[:marker].length   ## count number of = for heading level
           heading        = m[:text].strip

           puts "heading #{heading_level} >#{heading}<"    if debug?
           outline << [:"h#{heading_level}", heading]
        else    ## assume it's a (plain/regular) text line
           if start_para
             outline << [:p, [line]]
             start_para = false
           else
             node = outline[-1]    ## get last entry
             if node[0] == :p      ##  assert it's a p(aragraph) node!!!
                node[1] << line    ## add line to p(aragraph)
             else
               ## note: do NOT exit (the host process) from inside a library - raise instead
               raise "invalid outline state / format - expected p(aragraph) node; got: #{node.inspect}"
             end
           end
        end
    end
    outline
  end # method parse
end # class OutlineReader

end # module SportDb