sportdb-parser 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Manifest.txt +14 -0
- data/README.md +8 -0
- data/Rakefile +27 -0
- data/bin/fbt +144 -0
- data/lib/sportdb/parser/lang.rb +111 -0
- data/lib/sportdb/parser/linter.rb +153 -0
- data/lib/sportdb/parser/outline_reader.rb +101 -0
- data/lib/sportdb/parser/parser.rb +196 -0
- data/lib/sportdb/parser/token-date.rb +193 -0
- data/lib/sportdb/parser/token-score.rb +121 -0
- data/lib/sportdb/parser/token-text.rb +114 -0
- data/lib/sportdb/parser/token.rb +364 -0
- data/lib/sportdb/parser.rb +44 -0
- metadata +96 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 1466b82654b4a4f0f823a96709488dedb595d08731a55abc128691e0ffe2a80b
|
4
|
+
data.tar.gz: 14995e94dc079ab61e77d056d15c9a5830dc573129661ca453b2892d087c2061
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 75c2b4f455e8bb1b5e471c39f8fa3b5069bd0bb2a808ad8b246c0f2b060c5416f9f56a3619ad7db7ac5f21a6177c762aa28ae8e9c939b03a2569cf27d34f9b81
|
7
|
+
data.tar.gz: 9c4f9095a61410499ae7628b1eb3295d8f456e62feae45a4c254d9157904326abf6571f3c4a04c078551b6364cd09252509f709bfeef46a569dbe202f4058460
|
data/CHANGELOG.md
ADDED
data/Manifest.txt
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
CHANGELOG.md
|
2
|
+
Manifest.txt
|
3
|
+
README.md
|
4
|
+
Rakefile
|
5
|
+
bin/fbt
|
6
|
+
lib/sportdb/parser.rb
|
7
|
+
lib/sportdb/parser/lang.rb
|
8
|
+
lib/sportdb/parser/linter.rb
|
9
|
+
lib/sportdb/parser/outline_reader.rb
|
10
|
+
lib/sportdb/parser/parser.rb
|
11
|
+
lib/sportdb/parser/token-date.rb
|
12
|
+
lib/sportdb/parser/token-score.rb
|
13
|
+
lib/sportdb/parser/token-text.rb
|
14
|
+
lib/sportdb/parser/token.rb
|
data/README.md
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'hoe'
|
2
|
+
|
3
|
+
|
4
|
+
## gem specification for sportdb-parser (built via hoe)
Hoe.spec 'sportdb-parser' do

  self.version = '0.0.1'

  self.summary     = "sportdb-parser - football.txt match parser (& tokenizer)"
  self.description = summary     ## note: description reuses the summary text

  self.urls = { home: 'https://github.com/sportdb/sport.db' }

  self.author = 'Gerald Bauer'
  self.email  = 'gerald.bauer@gmail.com'

  # switch extension to .markdown for github formatting
  self.readme_file  = 'README.md'
  self.history_file = 'CHANGELOG.md'

  self.licenses = ['Public Domain']

  ## no runtime dependencies (for now)
  self.extra_deps = []

  self.spec_extras = { required_ruby_version: '>= 2.2.2' }

end
|
data/bin/fbt
ADDED
@@ -0,0 +1,144 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
## tip: to test run:
|
4
|
+
## ruby -I ./lib bin/fbt
|
5
|
+
|
6
|
+
require 'sportdb/parser'
|
7
|
+
|
8
|
+
|
9
|
+
require 'optparse'
|
10
|
+
|
11
|
+
##
|
12
|
+
## read textfile
|
13
|
+
## and dump tokens
|
14
|
+
##
|
15
|
+
## fbt ../openfootball/.../euro.txt
|
16
|
+
|
17
|
+
|
18
|
+
## season (directory) name
##   e.g. 2023-24  or
##        2024  or  2024--germany
SEASON_RE = %r{ (?:
                    \d{4}-\d{2}
                  | \d{4}(--[a-z0-9_-]+)?
                )
              }x
## regex source as string - "inline" helper for embedding in other regexes
SEASON = SEASON_RE.source

## match datafile path - a season directory followed by a (lowercase) .txt file
##  note: if pattern includes directory add here
##         (otherwise move to more "generic" datafile) - why? why not?
MATCH_RE = %r{ (?: ^|/ )      # beginning (^) or beginning of path (/)
               #{SEASON}
               /[a-z0-9_-]+\.txt$    ## txt e.g /1-premierleague.txt
             }x
|
32
|
+
|
33
|
+
|
34
|
+
## find all (match) datafiles in path (recursive)
##
##   path    - directory to search
##   pattern - regex a candidate path must match (default: MATCH_RE)
##
##  returns an array of the matching candidate paths
def find( path, pattern=MATCH_RE )
  ## check all txt files
  ##  note: the {*,.*} glob incl. files starting with dot (.) as candidates
  ##         (normally excluded with just *)
  Dir.glob( "#{path}/**/{*,.*}.txt" ).select do |candidate|
    pattern.match( candidate )
  end
end
|
48
|
+
|
49
|
+
|
50
|
+
|
51
|
+
|
52
|
+
|
53
|
+
|
54
|
+
## command line options & arguments
args = ARGV
opts = { debug: false, metal: false }

## note: block param named cli (and not parser) to avoid shadowing the outer local
parser = OptionParser.new do |cli|
  cli.banner = "Usage: #{$PROGRAM_NAME} [options]"

  ##
  ##  check if git has a offline option?? (use same)
  ##    check for other tools - why? why not?

  cli.on( "--verbose", "--debug",
          "turn on verbose / debug output (default: #{opts[:debug]} )" ) do |debug|
    opts[:debug] = debug
  end

  cli.on( "--metal",
          "turn off typed parse tree; show to the metal tokens"+
          " (default: #{opts[:metal]})" ) do |metal|
    opts[:metal] = metal
  end
end
## note: parse! removes all recognized options from args (in place)
parser.parse!( args )

puts "OPTS:"
p opts
puts "ARGV:"
p args
|
83
|
+
|
84
|
+
|
85
|
+
|
86
|
+
|
87
|
+
|
88
|
+
|
89
|
+
## expand command line args into a flat list of datafile paths
##
##   - if arg is a directory - auto-expand to all match datafiles found (see find)
##   - otherwise             - assume it's a file and pass along as is
##
##  returns a (new) array of paths (in the order of the args)
def expand_args( args )
  args.flat_map do |arg|
    ## not a directory? assume it's a file
    next [arg]  unless Dir.exist?( arg )

    datafiles = find( arg )
    puts
    puts " found #{datafiles.size} match txt datafiles in #{arg}"
    pp datafiles
    datafiles
  end
end
|
108
|
+
|
109
|
+
|
110
|
+
## datafiles to lint
##   - no args passed in?  use the two (hard-coded) euro sample datafiles
##   - otherwise           check for directories and auto-expand
default_paths = [
  '../../../openfootball/euro/2020--europe/euro.txt',
  '../../../openfootball/euro/2024--germany/euro.txt',
]
paths = args.empty? ? default_paths : expand_args( args )


SportDb::Parser::Linter.debug = true    if opts[:debug]

linter = SportDb::Parser::Linter.new

## read (and lint) datafile by datafile
##   note: --metal turns off parse, that is, tokenize only
paths.each_with_index do |path,i|
  puts "==> [#{i+1}/#{paths.size}] reading >#{path}<..."
  linter.read( path, parse: !opts[:metal] )
end

## print summary / report
if linter.errors?
  puts
  pp linter.errors
  puts "!! #{linter.errors.size} parse error(s) in #{paths.size} datafiles(s)"
else
  puts "OK no parse errors found in #{paths.size} datafile(s)"
end

puts "bye"
|
144
|
+
|
@@ -0,0 +1,111 @@
|
|
1
|
+
|
2
|
+
## use Sports (not SportDb) for module - why? why not?
|
3
|
+
|
4
|
+
|
5
|
+
|
6
|
+
module SportDb
|
7
|
+
class Parser

  ## Group A-Z
  ## Group 1-99
  ## Group HEX    # used in concacaf world cup quali
  ## Group 1A or A1, B1  - used anywhere
  ##
  ##  use "key" of group - why? why not?

  GROUP_RE = %r{^
                  Group [ ]
                    (?<key>[a-z0-9]+)
               $}ix

  ## check if text is a group name e.g. Group A, Group 1, Group HEX
  ##   returns true or false
  def is_group?( text )
    ## use regex for match
    GROUP_RE.match?( text )
  end


  ## note: all alternatives are wrapped in one group
  ##        so that the ^ and $ anchors apply to every alternative
  ROUND_RE = %r{^(

       # round - note - requires number e.g. round 1,2, etc.
       (?: (?: Round |
               Matchday |
               Week
           )
           [ ] [0-9]+
       )
     |
       # more (knockout) rounds
       # playoffs - playoff, play-off, play-offs
       (?: Play-?offs?
            (?: [ ]for[ ]quarter-?finals )?
       )
     |
       # round32
       (?: Round[ ]of[ ]32 |
           Last[ ]32 )
     |
       # round16
       (?: Round[ ]of[ ]16 |
           Last[ ]16 |
           8th[ ]finals )
     |
       # fifthplace
       (?:
          (?: (Fifth|5th)[ -]place
               (?: [ ] (?: match|play-?off|final ))?
          ) |
          (?: Match[ ]for[ ](?: fifth|5th )[ -]place )
       )
     |
       # thirdplace
       (?:
          (?: (Third|3rd)[ -]place
               (?: [ ] (?: match|play-?off|final ))?
          ) |
          (?: Match[ ]for[ ](?: third|3rd )[ -]place )
       )
     |
       # quarterfinals
       (?:
          Quarter-?finals? |
          Quarters |
          Last[ ]8
       )
     |
       # semifinals
       (?:
          Semi-?finals? |
          Semis |
          Last[ ]4
       )
     |
       # final
       Finals?

  )$}ix


  ## check if text is a round name e.g. Round 1, Matchday 12, Quarter-finals, Final
  ##   returns true or false
  def is_round?( text )
    ROUND_RE.match?( text )
  end

  ##
  ##  keep leg separate (from round) - why? why not?
  ##
  ##  note: the alternatives MUST be wrapped in a group (like in ROUND_RE);
  ##         otherwise ^ only binds to the first and $ only to the second alternative
  ##         and "1st leg xyz" or "xyz 2nd leg" will match too
  LEG_RE = %r{^(?:
       # leg1
       (?: 1st|First)[ ]leg
     |
       # leg2
       (?: 2nd|Second)[ ]leg
  )$}ix

  ### Pair matches/games if marked with leg1 n leg2
  ##
  ## check if text is a leg name e.g. 1st leg, First leg, 2nd leg, Second leg
  ##   returns true or false
  def is_leg?( text )
    LEG_RE.match?( text )
  end


end  # class Parser
|
111
|
+
end # module SportDb
|
@@ -0,0 +1,153 @@
|
|
1
|
+
|
2
|
+
module SportDb
|
3
|
+
class Parser
|
4
|
+
|
5
|
+
###
|
6
|
+
## note - Linter for now nested inside Parser - keep? why? why not?
|
7
|
+
## lint (tokenize or tokenize & parse) datafiles line-by-line
##   and collect all error messages in errors
class Linter

  ## class-wide debug flag
  def self.debug=(value) @@debug = value; end
  def self.debug?()      @@debug ||= false; end  ## note: default is FALSE
  def debug?()           self.class.debug?; end

  ## keep typed - why? why not?
  ##  - used anywhere?
  def self.typed=(value) @@typed = value; end
  ## note: default is TRUE
  ##   do NOT use @@typed ||= true here -
  ##     that turns an explicit false (set via typed=) back into true
  def self.typed?()
    @@typed = true    unless defined?( @@typed )
    @@typed
  end
  def typed?()           self.class.typed?; end


  ## array of [path, msg, line] triplets - one per error message
  attr_reader :errors

  def initialize
    @errors = []
    @parser = Parser.new   ## use own parser instance (not shared) - why? why not?
  end

  ## any errors collected (so far)?
  def errors?() !@errors.empty?; end


  ## note: colon (:) MUST be followed by one (or more) spaces
  ##      make sure mon feb 12 18:10 will not match
  ##        allow 1. FC Köln etc.
  ##               Mainz 05:
  ##           limit to 30 chars max
  ##          only allow chars incl. intl but (NOT ()[]/;)
  ##
  ##   Group A:
  ##   Group B:   - remove colon
  ##    or lookup first

  ATTRIB_RE = %r{^
                   [ ]*?     # slurp leading spaces
                (?<key>[^:|\]\[()\/; -]
                       [^:|\]\[()\/;]{0,30}
                 )
                   [ ]*?     # slurp trailing spaces
                   :[ ]+
                (?<value>.+)
                    [ ]*?   # slurp trailing spaces
                   $
                }ix


  #########
  ## read the datafile in path (via OutlineReader) and
  ##   tokenize (or tokenize & parse) all text lines;
  ##   every error message gets added to errors
  ##     as a [path, msg, line] triplet
  ##
  ##   parse - false (default) - tokenize (only)
  ##         - true            - tokenize & parse
  ##
  ##  raises ArgumentError on any node type other than :h1 or :p
  ##  returns the result of nodes.each (that is, the outline nodes)
  def read( path, parse: false )
    nodes = OutlineReader.read( path )

    ## process nodes
    h1      = nil
    orphans = 0    ## track paragraphs with no heading

    ## NOTE(review): attrib state is kept across paragraphs (and headings);
    ##   an attrib with no line ending in a dot (.) keeps skipping
    ##   all lines that follow - confirm if that is the intent
    attrib_found = false

    nodes.each do |node|
      type = node[0]

      case type
      when :h1
        h1 = node[1]  ## get heading text
        puts
        puts " = Heading 1 >#{node[1]}<"
      when :p
        if h1.nil?
          ## note: warns for every paragraph with no heading (with running count)
          orphans += 1
          puts "!! WARN - no heading for #{orphans} text paragraph(s); skipping parse"
          next
        end

        node[1].each do |line|
          if debug?
            puts
            puts "line >#{line}<"
          end

          ## skip new (experimental attrib syntax)
          ##   note: check attrib regex AFTER group def e.g.:
          ##         Group A:
          ##         Group B:  etc.
          ##     todo/fix - change Group A: to Group A etc.
          ##                       Group B: to Group B
          if !attrib_found && ATTRIB_RE.match?( line )
            attrib_found = true
            next
          end

          if attrib_found
            ## skip (continued) attrib lines
            ##   up to (and including) the first line ending with a dot
            attrib_found = false   if line.end_with?( '.' )
            next
          end

          t, error_messages = if parse
                                @parser.parse_with_errors( line )
                              else
                                @parser.tokenize_with_errors( line )
                              end

          ## add to "global" error list
          ##   make a triplet tuple (file / msg / line text)
          error_messages.each do |msg|
            @errors << [ path, msg, line ]
          end

          pp t   if debug?
        end
      else
        pp node
        raise ArgumentError, "unsupported (node) type >#{type}<"
      end
    end  # each node
  end  # read
end  # class Linter
|
150
|
+
|
151
|
+
|
152
|
+
end # class Parser
|
153
|
+
end # module SportDb
|
@@ -0,0 +1,101 @@
|
|
1
|
+
|
2
|
+
###
|
3
|
+
## todo/fix - move to sportdb-parser - why? why not? !!!!!!
|
4
|
+
##
|
5
|
+
|
6
|
+
|
7
|
+
module SportDb
|
8
|
+
|
9
|
+
## read text into an outline, that is,
##   a flat list of heading and paragraph nodes e.g.
##     [[:h1, "Heading"], [:p, ["line 1", "line 2"]], ...]
class OutlineReader

  def self.debug=(value) @@debug = value; end
  def self.debug?()      @@debug ||= false; end
  def debug?()           self.class.debug?; end


  ## read text from file in path (as utf-8) and parse
  def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
    parse( File.open( path, 'r:utf-8', &:read ) )
  end

  def self.parse( txt )
    new( txt ).parse
  end

  def initialize( txt )
    @txt = txt
  end

  ## note: skip "decorative" only heading e.g. ========
  ##  todo/check: find a better name e.g. HEADING_EMPTY_RE or HEADING_LINE_RE or ???
  HEADING_BLANK_RE = %r{\A
                          ={1,}
                        \z}x

  ## note: like in wikimedia markup (and markdown) all optional trailing ==== too
  HEADING_RE = %r{\A
                   (?<marker>={1,})       ## 1. leading ======
                     [ ]*
                   (?<text>[^=]+)         ## 2. text  (note: for now no "inline" = allowed)
                     [ ]*
                     =*                   ## 3. (optional) trailing ====
                  \z}x

  ## parse text line-by-line
  ##   returns the outline, that is, an array of
  ##     [:h<level>, text]  and  [:p, [line, line, ...]]  nodes
  def parse
    outline    = []    ## outline structure
    start_para = true  ## start new para(graph) on new text line?

    @txt.each_line do |raw|
      line = raw.strip     ## todo/fix: keep leading and trailing spaces - why? why not?

      ## blank line ends the para(graph)
      if line.empty?    ## todo/fix: keep blank line nodes?? - why? why not?
        start_para = true
        next
      end

      break if line == '__END__'

      next  if line.start_with?( '#' )   ## skip comments too

      ## strip inline (until end-of-line) comments too
      ##  e.g Eupen | KAS Eupen ## [de]
      ##   => Eupen | KAS Eupen
      ##  e.g bq   Bonaire,  BOE        # CONCACAF
      ##   => bq   Bonaire,  BOE
      line = line.sub( /#.*/, '' ).strip
      pp line   if debug?

      ## todo/check: also use heading blank as paragraph "breaker" or treat it like a comment ?? - why? why not?
      next  if HEADING_BLANK_RE.match?( line )   # skip "decorative" only heading e.g. ========

      md = HEADING_RE.match( line )
      if md
        start_para    = true
        heading_level = md[:marker].length   ## count number of = for heading level
        heading       = md[:text].strip

        puts "heading #{heading_level} >#{heading}<"   if debug?
        outline << [:"h#{heading_level}", heading]
        next
      end

      ## assume it's a (plain/regular) text line
      if start_para
        outline << [:p, [line]]
        start_para = false
        next
      end

      node = outline[-1]    ## get last entry
      unless node[0] == :p  ## assert it's a p(aragraph) node!!!
        puts "!! ERROR - invalid outline state / format - expected p(aragraph) node; got:"
        pp node
        exit 1
      end
      node[1] << line     ## add line to p(aragraph)
    end

    outline
  end # method parse
end # class OutlineReader
|
100
|
+
|
101
|
+
end # module SportDb
|