sportdb-parser 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 1466b82654b4a4f0f823a96709488dedb595d08731a55abc128691e0ffe2a80b
4
+ data.tar.gz: 14995e94dc079ab61e77d056d15c9a5830dc573129661ca453b2892d087c2061
5
+ SHA512:
6
+ metadata.gz: 75c2b4f455e8bb1b5e471c39f8fa3b5069bd0bb2a808ad8b246c0f2b060c5416f9f56a3619ad7db7ac5f21a6177c762aa28ae8e9c939b03a2569cf27d34f9b81
7
+ data.tar.gz: 9c4f9095a61410499ae7628b1eb3295d8f456e62feae45a4c254d9157904326abf6571f3c4a04c078551b6364cd09252509f709bfeef46a569dbe202f4058460
data/CHANGELOG.md ADDED
@@ -0,0 +1,3 @@
1
+ ### 0.0.1 / 2024-07-12
2
+
3
+ * Everything is new. First release.
data/Manifest.txt ADDED
@@ -0,0 +1,14 @@
1
+ CHANGELOG.md
2
+ Manifest.txt
3
+ README.md
4
+ Rakefile
5
+ bin/fbt
6
+ lib/sportdb/parser.rb
7
+ lib/sportdb/parser/lang.rb
8
+ lib/sportdb/parser/linter.rb
9
+ lib/sportdb/parser/outline_reader.rb
10
+ lib/sportdb/parser/parser.rb
11
+ lib/sportdb/parser/token-date.rb
12
+ lib/sportdb/parser/token-score.rb
13
+ lib/sportdb/parser/token-text.rb
14
+ lib/sportdb/parser/token.rb
data/README.md ADDED
@@ -0,0 +1,8 @@
1
+ # sportdb-parser - football.txt match parser (& tokenizer)
2
+
3
+
4
+
5
+
6
+
7
+
8
+
data/Rakefile ADDED
@@ -0,0 +1,27 @@
1
+ require 'hoe'
2
+
3
+
4
+ Hoe.spec 'sportdb-parser' do
5
+
6
+ self.version = '0.0.1'
7
+
8
+ self.summary = "sportdb-parser - football.txt match parser (& tokenizer)"
9
+ self.description = summary
10
+
11
+ self.urls = { home: 'https://github.com/sportdb/sport.db' }
12
+
13
+ self.author = 'Gerald Bauer'
14
+ self.email = 'gerald.bauer@gmail.com'
15
+
16
+ # switch extension to .markdown for GitHub formatting
17
+ self.readme_file = 'README.md'
18
+ self.history_file = 'CHANGELOG.md'
19
+
20
+ self.licenses = ['Public Domain']
21
+
22
+ self.extra_deps = []
23
+
24
+ self.spec_extras = {
25
+ required_ruby_version: '>= 2.2.2'
26
+ }
27
+ end
data/bin/fbt ADDED
@@ -0,0 +1,144 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ ## tip: to test run:
4
+ ## ruby -I ./lib bin/fbt
5
+
6
+ require 'sportdb/parser'
7
+
8
+
9
+ require 'optparse'
10
+
11
+ ##
12
+ ## read textfile
13
+ ## and dump tokens
14
+ ##
15
+ ## fbt ../openfootball/.../euro.txt
16
+
17
+
18
## season (directory) name - one of:
##    four digits, dash, two digits    e.g. 2024-25
##    four digits with optional --slug e.g. 2024 or 2024--germany
SEASON_RE = %r{ (?:
                    \d{4}-\d{2}
                  | \d{4}(--[a-z0-9_-]+)?
                )
              }x

## regex source (text) of SEASON_RE
##   "inline" helper for embedding in other regexes - keep? why? why not?
SEASON = SEASON_RE.source


## match datafile path - season directory followed by a .txt file
##    e.g. 2024--germany/euro.txt
##         england/2023-24/1-premierleague.txt
##
## note: if pattern includes directory add here
##   (otherwise move to more "generic" datafile) - why? why not?
MATCH_RE = %r{ (?: ^|/ )      # beginning (^) or beginning of path (/)
               #{SEASON}
               /[a-z0-9_-]+\.txt$  ## txt e.g /1-premierleague.txt
             }x
32
+
33
+
34
## Collect the match datafiles below a directory.
##
##   path    - root directory; searched recursively for .txt files
##   pattern - anything responding to match( str ); only candidate paths
##             matching the pattern are kept (default: MATCH_RE)
##
## Returns an array of paths (strings) sorted by name.
def find( path, pattern=MATCH_RE )
  ## check all txt files
  ## note: incl. files starting with dot (.) as candidates (normally excluded with just *)
  ## note: sort the candidates - the order returned by glob is
  ##         file system dependent before ruby 3.0
  candidates = Dir.glob( "#{path}/**/{*,.*}.txt" ).sort
  ## pp candidates

  candidates.select { |candidate| pattern.match( candidate ) }
end
48
+
49
+
50
+
51
+
52
+
53
+
54
## command-line arguments and option defaults
##   note: args is ARGV itself (not a copy) - parse! below removes the
##           option switches in-place; what is left are the paths
args = ARGV
opts = { debug: false,
         metal: false }

## note: the block param is named o (not parser)
##         to avoid shadowing the outer parser local
parser = OptionParser.new do |o|
  o.banner = "Usage: #{$PROGRAM_NAME} [options]"

  ##
  ## check if git has a offline option?? (use same)
  ##   check for other tools - why? why not?


  o.on( "--verbose", "--debug",
        "turn on verbose / debug output (default: #{opts[:debug]} )" ) do |debug|
    opts[:debug] = debug
  end

  o.on( "--metal",
        "turn off typed parse tree; show to the metal tokens"+
        " (default: #{opts[:metal]})" ) do |metal|
    opts[:metal] = metal
  end
end
parser.parse!( args )

## always dump the resulting options and the remaining arguments
puts "OPTS:"
p opts
puts "ARGV:"
p args
83
+
84
+
85
+
86
+
87
+
88
+
89
## Turn the command-line arguments into a flat list of datafile paths.
##
##   - a directory gets replaced by the match datafiles found inside
##       (see find); the finds get reported / dumped
##   - anything else is assumed to be a file and passed along as is
##
## Returns a new array of paths.
def expand_args( args )
  args.flat_map do |arg|
    ## not a directory? assume it's a file
    next arg   unless Dir.exist?( arg )

    datafiles = find( arg )
    puts
    puts " found #{datafiles.size} match txt datafiles in #{arg}"
    pp datafiles
    datafiles
  end
end
108
+
109
+
110
## datafiles to check - the (expanded) command-line arguments or,
##   if no arguments are given, two hard-coded sample datafiles
paths = if args.empty?
          [
            '../../../openfootball/euro/2020--europe/euro.txt',
            '../../../openfootball/euro/2024--germany/euro.txt',
          ]
        else
          ## check for directories
          ##   and auto-expand
          expand_args( args )
        end


SportDb::Parser::Linter.debug = true    if opts[:debug]

linter = SportDb::Parser::Linter.new


## note: --metal turns off the parse step, that is, tokenize only
paths.each_with_index do |path,i|
  puts "==> [#{i+1}/#{paths.size}] reading >#{path}<..."
  linter.read( path, parse: !opts[:metal] )
end

## report - dump all collected errors (if any) plus a one-line summary
if linter.errors?
  puts
  pp linter.errors
  puts "!! #{linter.errors.size} parse error(s) in #{paths.size} datafile(s)"
else
  puts "OK no parse errors found in #{paths.size} datafile(s)"
end

puts "bye"
144
+
@@ -0,0 +1,111 @@
1
+
2
+ ## use Sports (not SportDb) for module - why? why not?
3
+
4
+
5
+
6
module SportDb
class Parser

  ##  Group A-Z
  ##  Group 1-99
  ##  Group HEX  # used in CONCACAF world cup quali
  ##  Group 1A or A1, B1  - used anywhere
  ##
  ##  use "key" of group - why? why not?

  GROUP_RE = %r{^
                  Group [ ]
                  (?<key>[a-z0-9]+)
               $}ix

  ## Is the text a group name, that is,
  ##   Group followed by a single space and an alphanumeric key
  ##   (case-insensitive)?  returns true or false
  def is_group?( text )
    ## use regex for match
    GROUP_RE.match?( text )
  end


  ROUND_RE = %r{^(

     # round  - note - requires number e.g. round 1,2, etc.
     (?: (?: Round |
             Matchday |
             Week
         )
         [ ] [0-9]+
     )
     |
     # more (knockout) rounds
     # playoffs  - playoff, play-off, play-offs
     (?: Play-?offs?
          (?: [ ]for[ ]quarter-?finals )?
     )
     |
     # round32
     (?: Round[ ]of[ ]32 |
         Last[ ]32 )
     |
     # round16
     (?: Round[ ]of[ ]16 |
         Last[ ]16 |
         8th[ ]finals )
     |
     # fifthplace
     (?:
         (?: (Fifth|5th)[ -]place
             (?: [ ] (?: match|play-?off|final ))?
         ) |
         (?: Match[ ]for[ ](?: fifth|5th )[ -]place )
     )
     |
     # thirdplace
     (?:
         (?: (Third|3rd)[ -]place
             (?: [ ] (?: match|play-?off|final ))?
         ) |
         (?: Match[ ]for[ ](?: third|3rd )[ -]place )
     )
     |
     # quarterfinals
     (?:
         Quarter-?finals? |
         Quarters |
         Last[ ]8
     )
     |
     # semifinals
     (?:
         Semi-?finals? |
         Semis |
         Last[ ]4
     )
     |
     # final
     Finals?

  )$}ix


  ## Is the text a round name e.g. Round 1, Matchday 12,
  ##   Round of 16, Quarter-finals, Final, etc.
  ##   (case-insensitive)?  returns true or false
  def is_round?( text )
    ROUND_RE.match?( text )
  end

  ##
  ##  keep leg separate (from round) - why? why not?
  ##
  ##  note: the two alternatives MUST be wrapped in a group;
  ##          without the group the start anchor (^) applies to
  ##          the first alternative only and the end anchor ($)
  ##          to the second only, that is,
  ##          "1st leg of the final" or "return 2nd leg" would match too
  LEG_RE = %r{^(?:
     # leg1
     (?: 1st|First)[ ]leg
     |
     # leg2
     (?: 2nd|Second)[ ]leg
  )$}ix

  ### Pair matches/games if marked with leg1 n leg2
  ##
  ## Is the text a leg name, that is, 1st leg, First leg,
  ##   2nd leg or Second leg
  ##   (case-insensitive)?  returns true or false
  def is_leg?( text )
    LEG_RE.match?( text )
  end


end  # class Parser
end  # module SportDb
@@ -0,0 +1,153 @@
1
+
2
module SportDb
class Parser

###
## note - Linter for now nested inside Parser - keep? why? why not?
##
##  The linter reads datafiles via the OutlineReader,
##   runs every text line through the parser (or the tokenizer only)
##   and collects all reported error messages in errors.
class Linter

  ## debug flag (class-level) - used by all linters
  def self.debug=(value)   @@debug = value; end
  def self.debug?()        @@debug ||= false; end  ## note: default is FALSE
  def debug?()  self.class.debug?; end

  ## keep typed - why? why not?
  ##  - used anywhere?
  def self.typed=(value)  @@typed = value; end
  ## note: default is TRUE
  ##   do NOT use @@typed ||= true here -
  ##     it turns an explicit false back into true
  def self.typed?()
    return true   unless defined?( @@typed )
    @@typed.nil? ? true : @@typed
  end
  def typed?()  self.class.typed?; end



  ## list of all errors collected by read so far
  ##   every error is a triplet tuple (file / msg / line text)
  attr_reader :errors

  def initialize
    @errors = []
    @parser = Parser.new   ## use own parser instance (not shared) - why? why not?
  end


  ## any errors collected?
  def errors?() !@errors.empty?; end



  ## note: colon (:) MUST be followed by one (or more) spaces
  ##      make sure mon feb 12 18:10 will not match
  ##        allow 1. FC Köln etc.
  ##               Mainz 05:
  ##           limit to 30 chars max
  ##          only allow  chars incl. intl but (NOT  ()[]/;)
  ##
  ##   Group A:
  ##   Group B:   - remove colon
  ##    or lookup first

  ATTRIB_RE = %r{^
                   [ ]*?     # slurp leading spaces
                (?<key>[^:|\]\[()\/; -]
                       [^:|\]\[()\/;]{0,30}
                 )
                   [ ]*?     # slurp trailing spaces
                   :[ ]+
                (?<value>.+)
                    [ ]*?   # slurp trailing spaces
                   $
                }ix


  #########
  ## Read the datafile and add all error messages to errors.
  ##
  ##   path  - path of the datafile (passed along to OutlineReader.read)
  ##   parse - false (default) - tokenize (only)
  ##         - true            - tokenize & parse
  ##
  ## Paragraphs before the first level-1 heading get skipped
  ##   (with a warning); any node type other than :h1 or :p
  ##   raises an ArgumentError.
  ##
  ## Returns the outline nodes read.
  def read( path, parse: false )
    nodes = OutlineReader.read( path )

    ## process nodes
    h1      = nil
    orphans = 0    ## track paragraphs with no heading

    ## note: the attribute (continuation) state is kept across paragraphs
    attrib_found = false


    nodes.each do |node|
      type = node[0]

      if type == :h1
        h1 = node[1]  ## get heading text
        puts
        puts " = Heading 1 >#{node[1]}<"
      elsif type == :p

        if h1.nil?
          ## note: warns for every paragraph with no heading (with running count)
          orphans += 1
          puts "!! WARN - no heading for #{orphans} text paragraph(s); skipping parse"
          next
        end

        lines = node[1]

        tree = []    ## collected for debugging only (see pp tree below)
        lines.each do |line|

          if debug?
            puts
            puts "line >#{line}<"
          end


          ## skip new (experimental attrib syntax)
          if !attrib_found && ATTRIB_RE.match?( line )
            ## note: check attrib regex AFTER group def e.g.:
            ##         Group A:
            ##         Group B:  etc.
            ##     todo/fix - change Group A: to Group A etc.
            ##                       Group B: to Group B
            attrib_found = true
            ## logger.debug "skipping key/value line - >#{line}<"
            next
          end

          if attrib_found
            ## check if line ends with dot
            ##  if not slurp up lines to the next dot!!!
            ## NOTE(review): the line following an attribute line always
            ##   gets skipped - even if the attribute line itself
            ##   ends with a dot - confirm that this is intended
            attrib_found = false   if line.end_with?( '.' )
            # logger.debug "skipping key/value line (cont.) - >#{line}<"
            next
          end

          t, error_messages = if parse
                                @parser.parse_with_errors( line )
                              else
                                @parser.tokenize_with_errors( line )
                              end


          ## add to "global" error list
          ##   make a triplet tuple (file / msg / line text)
          error_messages.each do |msg|
            @errors << [ path,
                         msg,
                         line
                       ]
          end

          pp t   if debug?

          tree << t
        end

        ## pp tree
      else
        pp node
        raise ArgumentError, "unsupported (node) type >#{type}<"
      end
    end  # each node
  end  # read
end  # class Linter


end   # class Parser
end   # module SportDb
@@ -0,0 +1,101 @@
1
+
2
+ ###
3
+ ## todo/fix - move to sportdb-parser - why? why not? !!!!!!
4
+ ##
5
+
6
+
7
module SportDb

## Reads a text into a flat outline, that is, a list of nodes:
##    [:h1, "heading text"], [:h2, "heading text"], ...  - headings
##    [:p,  ["line 1", "line 2", ...]]                   - paragraphs
##
##  - blank lines end a paragraph
##  - comments (#) - full line or inline until end-of-line - get removed
##  - everything after a line with __END__ gets ignored
class OutlineReader

  def self.debug=(value) @@debug = value; end
  def self.debug?()      @@debug ||= false; end   ## note: default is FALSE
  def debug?()           self.class.debug?; end



  ## Read the (utf-8) text file and return the parsed outline.
  ##   note: bom|utf-8 skips the (optional) leading unicode
  ##           byte order mark (BOM); otherwise a heading in
  ##           the first line will NOT get recognized as a heading
  def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
    txt = File.open( path, 'r:bom|utf-8' ) {|f| f.read }
    parse( txt )
  end

  ## Parse the text and return the outline.
  def self.parse( txt )
    new( txt ).parse
  end

  def initialize( txt )
    @txt = txt
  end

  ## note: skip "decorative" only heading e.g. ========
  ##  todo/check: find a better name e.g. HEADING_EMPTY_RE or HEADING_LINE_RE or ???
  HEADING_BLANK_RE = %r{\A
                        ={1,}
                        \z}x

  ## note: like in wikimedia markup (and markdown) all optional trailing ==== too
  HEADING_RE = %r{\A
                  (?<marker>={1,})       ## 1. leading ======
                    [ ]*
                  (?<text>[^=]+)         ## 2. text  (note: for now no "inline" = allowed)
                    [ ]*
                    =*                   ## 3. (optional) trailing ====
                  \z}x

  ## Parse the text (passed in on new) and return the outline,
  ##   that is, an array of heading and paragraph nodes.
  ##
  ## Raises a RuntimeError on an invalid outline state
  ##   (text line to add but the last node is no paragraph).
  def parse
    outline=[]   ## outline structure
    start_para = true      ## start new para(graph) on new text line?

    @txt.each_line do |line|
        line = line.strip      ## todo/fix: keep leading and trailing spaces - why? why not?

        if line.empty?    ## todo/fix: keep blank line nodes?? and just remove comments and process headings?! - why? why not?
          start_para = true
          next
        end

        break if line == '__END__'

        next if line.start_with?( '#' )   ## skip comments too
        ## strip inline (until end-of-line) comments too
        ##  e.g Eupen | KAS Eupen ## [de]
        ##   => Eupen | KAS Eupen
        ##  e.g bq   Bonaire,  BOE        # CONCACAF
        ##   => bq   Bonaire,  BOE
        line = line.sub( /#.*/, '' ).strip
        pp line    if debug?

        ## todo/check: also use heading blank as paragraph "breaker" or treat it like a comment ?? - why? why not?
        next if HEADING_BLANK_RE.match( line )  # skip "decorative" only heading e.g. ========

         ## note: like in wikimedia markup (and markdown) all optional trailing ==== too
        if m=HEADING_RE.match( line )
           start_para = true

           heading_level  = m[:marker].length   ## count number of = for heading level
           heading        = m[:text].strip

           puts "heading #{heading_level} >#{heading}<"    if debug?
           outline << [:"h#{heading_level}", heading]
        else    ## assume it's a (plain/regular) text line
           if start_para
             outline << [:p, [line]]
             start_para = false
           else
             node = outline[-1]    ## get last entry
             if node[0] == :p      ##  assert it's a p(aragraph) node!!!
                node[1] << line    ## add line to p(aragraph)
             else
               ## note: raise (do NOT exit) - let the caller decide what to do
               raise "invalid outline state / format - expected p(aragraph) node; got: #{node.inspect}"
             end
           end
        end
    end
    outline
  end # method parse
end # class OutlineReader

end # module SportDb