sportdb-parser 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Manifest.txt +14 -0
- data/README.md +8 -0
- data/Rakefile +27 -0
- data/bin/fbt +144 -0
- data/lib/sportdb/parser/lang.rb +111 -0
- data/lib/sportdb/parser/linter.rb +153 -0
- data/lib/sportdb/parser/outline_reader.rb +101 -0
- data/lib/sportdb/parser/parser.rb +196 -0
- data/lib/sportdb/parser/token-date.rb +193 -0
- data/lib/sportdb/parser/token-score.rb +121 -0
- data/lib/sportdb/parser/token-text.rb +114 -0
- data/lib/sportdb/parser/token.rb +364 -0
- data/lib/sportdb/parser.rb +44 -0
- metadata +96 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 1466b82654b4a4f0f823a96709488dedb595d08731a55abc128691e0ffe2a80b
|
4
|
+
data.tar.gz: 14995e94dc079ab61e77d056d15c9a5830dc573129661ca453b2892d087c2061
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 75c2b4f455e8bb1b5e471c39f8fa3b5069bd0bb2a808ad8b246c0f2b060c5416f9f56a3619ad7db7ac5f21a6177c762aa28ae8e9c939b03a2569cf27d34f9b81
|
7
|
+
data.tar.gz: 9c4f9095a61410499ae7628b1eb3295d8f456e62feae45a4c254d9157904326abf6571f3c4a04c078551b6364cd09252509f709bfeef46a569dbe202f4058460
|
data/CHANGELOG.md
ADDED
data/Manifest.txt
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
CHANGELOG.md
|
2
|
+
Manifest.txt
|
3
|
+
README.md
|
4
|
+
Rakefile
|
5
|
+
bin/fbt
|
6
|
+
lib/sportdb/parser.rb
|
7
|
+
lib/sportdb/parser/lang.rb
|
8
|
+
lib/sportdb/parser/linter.rb
|
9
|
+
lib/sportdb/parser/outline_reader.rb
|
10
|
+
lib/sportdb/parser/parser.rb
|
11
|
+
lib/sportdb/parser/token-date.rb
|
12
|
+
lib/sportdb/parser/token-score.rb
|
13
|
+
lib/sportdb/parser/token-text.rb
|
14
|
+
lib/sportdb/parser/token.rb
|
data/README.md
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'hoe'
|
2
|
+
|
3
|
+
|
4
|
+
## Gem specification (via Hoe) for the sportdb-parser package.
Hoe.spec( 'sportdb-parser' ) do
  self.version     = '0.0.1'

  self.summary     = "sportdb-parser - football.txt match parser (& tokenizer)"
  self.description = summary    ## description simply reuses the summary

  self.urls        = { home: 'https://github.com/sportdb/sport.db' }

  self.author      = 'Gerald Bauer'
  self.email       = 'gerald.bauer@gmail.com'

  ## switch extension to .markdown for github formatting
  self.readme_file  = 'README.md'
  self.history_file = 'CHANGELOG.md'

  self.licenses    = ['Public Domain']

  ## no runtime dependencies
  self.extra_deps  = []

  ## NOTE(review): lib/sportdb/parser/lang.rb calls Regexp#match?,
  ##   which presumably needs a newer ruby than 2.2.2 - confirm this floor
  self.spec_extras = { required_ruby_version: '>= 2.2.2' }
end
|
data/bin/fbt
ADDED
@@ -0,0 +1,144 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
## tip: to test run:
|
4
|
+
## ruby -I ./lib bin/fbt
|
5
|
+
|
6
|
+
require 'sportdb/parser'
|
7
|
+
|
8
|
+
|
9
|
+
require 'optparse'
|
10
|
+
|
11
|
+
##
|
12
|
+
## read textfile
|
13
|
+
## and dump tokens
|
14
|
+
##
|
15
|
+
## fbt ../openfootball/.../euro.txt
|
16
|
+
|
17
|
+
|
18
|
+
## Season (directory) name - one of:
##   - 2019-20             (four-digit year, dash, two-digit year)
##   - 2024                (four-digit year only)
##   - 2024--germany       (four-digit year plus a --slug)
SEASON_RE = %r{ (?:
                    \d{4}-\d{2}
                  | \d{4}(--[a-z0-9_-]+)?
                )
              }x

## regex source as a string - "inline" helper for embedding in other regexes
##   - keep? why? why not?
SEASON = SEASON_RE.source


## Match datafile path, that is, a season directory
##   followed by a lower-case .txt file name.
##
## note: if pattern includes directory add here
##     (otherwise move to more "generic" datafile) - why? why not?
MATCH_RE = %r{ (?: ^|/ )           # beginning (^) or beginning of path (/)
               #{SEASON}
               /[a-z0-9_-]+\.txt$  ## txt e.g /1-premierleague.txt
             }x
|
32
|
+
|
33
|
+
|
34
|
+
## Collect all .txt files (recursively) below path whose
##   full path matches pattern.
##
##  path    - directory to search
##  pattern - regex applied to every candidate path (default: MATCH_RE)
##
## Returns an array of the matching paths (in Dir.glob order).
def find( path, pattern=MATCH_RE )
  ## note: the glob incl. files starting with dot (.) as candidates
  ##         (normally excluded with just *)
  Dir.glob( "#{path}/**/{*,.*}.txt" ).select do |candidate|
    pattern.match( candidate )
  end
end
|
48
|
+
|
49
|
+
|
50
|
+
|
51
|
+
|
52
|
+
|
53
|
+
|
54
|
+
## Command line handling:
##   opts - option flags (filled in by the option parser)
##   args - command line arguments; parse! removes the
##            recognized option switches in place
opts = { debug: false, metal: false }
args = ARGV

parser = OptionParser.new do |cli|
  cli.banner = "Usage: #{$PROGRAM_NAME} [options]"

  ##
  ## check if git has a offline option?? (use same)
  ##  check for other tools - why? why not?

  debug_help = "turn on verbose / debug output (default: #{opts[:debug]} )"
  cli.on( "--verbose", "--debug", debug_help ) { |flag| opts[:debug] = flag }

  metal_help = "turn off typed parse tree; show to the metal tokens (default: #{opts[:metal]})"
  cli.on( "--metal", metal_help ) { |flag| opts[:metal] = flag }
end
parser.parse!( args )

## echo the effective options and the left-over arguments
puts "OPTS:"
p opts
puts "ARGV:"
p args
|
83
|
+
|
84
|
+
|
85
|
+
|
86
|
+
|
87
|
+
|
88
|
+
|
89
|
+
## Expand command line arguments into a flat list of datafile paths.
##
##  - a directory gets replaced by the datafiles found (via find) inside;
##      the findings get printed too
##  - anything else is assumed to be a file and passed along as is
##
## Returns a new array of paths.
def expand_args( args )
  args.flat_map do |arg|
    ## not a directory? assume it's a file
    next [arg]  unless Dir.exist?( arg )

    datafiles = find( arg )
    puts
    puts " found #{datafiles.size} match txt datafiles in #{arg}"
    pp datafiles
    datafiles
  end
end
|
108
|
+
|
109
|
+
|
110
|
+
## Datafiles to lint:
##   - no arguments  => use the built-in sample datafiles
##   - otherwise     => use the arguments (directories get auto-expanded)
paths = if args.empty?
          [
            '../../../openfootball/euro/2020--europe/euro.txt',
            '../../../openfootball/euro/2024--germany/euro.txt',
          ]
        else
          ## check for directories
          ##   and auto-expand
          expand_args( args )
        end


SportDb::Parser::Linter.debug = true   if opts[:debug]

linter = SportDb::Parser::Linter.new


## note: --metal turns off parse (that is, tokenize only)
paths.each_with_index do |path,i|
  puts "==> [#{i+1}/#{paths.size}] reading >#{path}<..."
  linter.read( path, parse: !opts[:metal] )
end

## summary - errors get collected across all datafiles
if linter.errors?
  puts
  pp linter.errors
  ## fix: was "datafiles(s)" - now same wording as in the OK message
  puts "!! #{linter.errors.size} parse error(s) in #{paths.size} datafile(s)"
else
  puts "OK no parse errors found in #{paths.size} datafile(s)"
end

puts "bye"
|
144
|
+
|
@@ -0,0 +1,111 @@
|
|
1
|
+
|
2
|
+
## use Sports (not SportDb) for module - why? why not?
|
3
|
+
|
4
|
+
|
5
|
+
|
6
|
+
module SportDb
class Parser

  ## Group name e.g.
  ##   Group A-Z
  ##   Group 1-99
  ##   Group HEX     # used in concaf world cup quali
  ##   Group 1A or A1, B1  - used anywhere
  ##
  ##  use "key" of group - why? why not?
  GROUP_RE = %r{^
                  Group [ ]
                  (?<key>[a-z0-9]+)
               $}ix

  ## Returns true if text is a group name (e.g. Group A); false otherwise.
  def is_group?( text )
    ## use regex for match
    GROUP_RE.match?( text )
  end


  ## Round name - the complete text must be one of the
  ##   known round names (case-insensitive).
  ROUND_RE = %r{^(

             # round  - note - requires number e.g. round 1,2, etc.
               (?: (?: Round |
                       Matchday |
                       Week
                   )
                   [ ] [0-9]+
               )
             |
             # more (knockout) rounds
             # playoffs  - playoff, play-off, play-offs
               (?: Play-?offs?
                    (?: [ ]for[ ]quarter-?finals )?
               )
             |
             # round32
               (?: Round[ ]of[ ]32 |
                   Last[ ]32 )
             |
             # round16
               (?: Round[ ]of[ ]16 |
                   Last[ ]16 |
                   8th[ ]finals )
             |
             # fifthplace
               (?:
                   (?: (Fifth|5th)[ -]place
                        (?: [ ] (?: match|play-?off|final ))?
                   ) |
                   (?: Match[ ]for[ ](?: fifth|5th )[ -]place )
               )
             |
             # thirdplace
               (?:
                   (?: (Third|3rd)[ -]place
                        (?: [ ] (?: match|play-?off|final ))?
                   ) |
                   (?: Match[ ]for[ ](?: third|3rd )[ -]place )
               )
             |
             # quarterfinals
               (?:
                   Quarter-?finals? |
                   Quarters |
                   Last[ ]8
               )
             |
             # semifinals
               (?:
                   Semi-?finals? |
                   Semis |
                   Last[ ]4
               )
             |
             # final
               Finals?

            )$}ix


  ## Returns true if text is a round name (e.g. Matchday 1, Quarter-finals);
  ##   false otherwise.
  def is_round?( text )
    ROUND_RE.match?( text )
  end

  ##
  ##  keep leg separate (from round) - why? why not?
  ##
  ## note: the two alternatives MUST be wrapped in a group;
  ##   otherwise the start anchor applies to the first alternative only
  ##   and the end anchor to the second alternative only
  ##   (and, thus, "1st leg replay" or "Final 2nd leg" would match too)
  LEG_RE = %r{^
               (?:
                  # leg1
                    (?: 1st|First)[ ]leg
                  |
                  # leg2
                    (?: 2nd|Second)[ ]leg
               )
             $}ix

  ### Pair matches/games if marked with leg1 n leg2
  ##
  ## Returns true if the complete text is a leg name
  ##   (e.g. 1st leg, Second leg); false otherwise.
  def is_leg?( text )
    LEG_RE.match?( text )
  end


end  # class Parser
end  # module SportDb
|
@@ -0,0 +1,153 @@
|
|
1
|
+
|
2
|
+
module SportDb
class Parser

###
## note - Linter for now nested inside Parser - keep? why? why not?
##
## Reads outline datafiles, runs every text line through
##  the tokenizer (or the parser) and collects all reported errors.
class Linter

  def self.debug=(value) @@debug = value; end
  def self.debug?()      @@debug ||= false; end  ## note: default is FALSE
  def debug?()  self.class.debug?; end

  ## keep typed - why? why not?
  ##  - used anywhere?
  def self.typed=(value) @@typed = value; end
  def self.typed?
    ## note: default is TRUE
    ##   do NOT use ||= here - it would turn an explicit false back into true
    @@typed = true   if !defined?( @@typed ) || @@typed.nil?
    @@typed
  end
  def typed?()  self.class.typed?; end



  ## collected errors - array of [path, message, line text] triplets
  attr_reader :errors

  def initialize
    @errors = []
    @parser = Parser.new   ## use own parser instance (not shared) - why? why not?
  end


  ## Returns true if any errors got collected (by read).
  def errors?() !@errors.empty?; end



  ## Attribute (key: value) line
  ##
  ## note: colon (:) MUST be followed by one (or more) spaces
  ##      make sure mon feb 12 18:10 will not match
  ##        allow 1. FC Köln etc.
  ##               Mainz 05:
  ##           limit key to 31 chars max (first char plus up to 30 more)
  ##          only allow chars incl. intl but (NOT ()[]/;)
  ##
  ##   Group A:
  ##   Group B:   - remove colon
  ##     or lookup first
  ATTRIB_RE = %r{^
                   [ ]*?     # slurp leading spaces
                (?<key>[^:|\]\[()\/; -]
                       [^:|\]\[()\/;]{0,30}
                 )
                   [ ]*?     # slurp trailing spaces
                   :[ ]+
                (?<value>.+)
                    [ ]*?   # slurp trailing spaces
                   $
                }ix


  #########
  ## Read the datafile at path and lint all text lines.
  ##
  ##  parse - false (default) - tokenize (only)
  ##        - true            - tokenize & parse
  ##
  ## Every reported error message gets added to errors
  ##   as a [path, message, line text] triplet.
  ##
  ## Raises ArgumentError on any outline node
  ##   other than a level 1 heading (h1) or a paragraph (p).
  def read( path, parse: false )
    nodes = OutlineReader.read( path )

    ## process nodes
    h1      = nil
    orphans = 0    ## track paragraphs with no heading

    ## note: the attribute state is kept across paragraphs (and headings)
    attrib_found = false


    nodes.each do |node|
      type = node[0]

      if type == :h1
        h1 = node[1]  ## get heading text
        puts
        puts " = Heading 1 >#{node[1]}<"
      elsif type == :p

        if h1.nil?
          ## note: warns for every paragraph with no heading (with running count)
          orphans += 1
          puts "!! WARN - no heading for #{orphans} text paragraph(s); skipping parse"
          next
        end

        lines = node[1]

        tree = []   ## collected results; for now for debugging only (see pp tree)
        lines.each do |line|

          if debug?
            puts
            puts "line >#{line}<"
          end


          ## skip new (experimental attrib syntax)
          if !attrib_found && ATTRIB_RE.match( line )
            ## note: check attrib regex AFTER group def e.g.:
            ##         Group A:
            ##         Group B:  etc.
            ##     todo/fix - change Group A: to Group A etc.
            ##                       Group B: to Group B
            attrib_found = true
            ## logger.debug "skipping key/value line - >#{line}<"
            next
          end

          if attrib_found
            ## check if line ends with dot
            ##  if not slurp up lines to the next dot!!!
            attrib_found = false   if line.end_with?( '.' )
            # logger.debug "skipping key/value line (cont.) - >#{line}<"
            next
          end

          t, error_messages = if parse
                                @parser.parse_with_errors( line )
                              else
                                @parser.tokenize_with_errors( line )
                              end


          ## add to "global" error list
          ##   make a triplet tuple (file / msg / line text)
          error_messages.each do |msg|
            @errors << [ path, msg, line ]
          end

          pp t   if debug?

          tree << t
        end

        ## pp tree
      else
        pp node
        raise ArgumentError, "unsupported (node) type >#{type}<"
      end
    end  # each node
  end  # read
end  # class Linter


end   # class Parser
end   # module SportDb
|
@@ -0,0 +1,101 @@
|
|
1
|
+
|
2
|
+
###
|
3
|
+
## todo/fix - move to sportdb-parser - why? why not? !!!!!!
|
4
|
+
##
|
5
|
+
|
6
|
+
|
7
|
+
module SportDb

## Reads a text with (wikimedia-style) headings into a flat outline,
##   that is, an array of nodes:
##     [:h1, "heading text"], [:h2, "heading text"], ...  - headings
##     [:p, ["line 1", "line 2", ...]]                    - paragraphs (text lines)
class OutlineReader

  def self.debug=(value) @@debug = value; end
  def self.debug?()      @@debug ||= false; end
  def debug?()  self.class.debug?; end


  ## Read the (utf-8) file at path and return its outline.
  def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
    parse( File.read( path, encoding: 'utf-8' ) )
  end

  ## Return the outline for the text passed in.
  def self.parse( txt )
    new( txt ).parse
  end

  def initialize( txt )
    @txt = txt
  end

  ## note: skip "decorative" only heading e.g. ========
  ##  todo/check:  find a better name e.g. HEADING_EMPTY_RE or HEADING_LINE_RE or ???
  HEADING_BLANK_RE = %r{\A
                         ={1,}
                        \z}x

  ## note: like in wikimedia markup (and markdown) all optional trailing ==== too
  HEADING_RE = %r{\A
                   (?<marker>={1,})       ## 1. leading ======
                     [ ]*
                   (?<text>[^=]+)         ## 2. text  (note: for now no "inline" = allowed)
                     [ ]*
                     =*                   ## 3. (optional) trailing ====
                  \z}x

  ## Walk the text line by line and build the outline.
  ##
  ##  - blank lines end the current paragraph
  ##  - a line reading __END__ stops processing
  ##  - comment lines (starting with #) get skipped;
  ##      inline comments (# to end-of-line) get cut off
  ##  - "decorative" only headings (e.g. =====) get skipped
  ##
  ## Returns the array of outline nodes.
  def parse
    outline    = []     ## outline structure
    fresh_para = true   ## start new para(graph) on new text line?

    @txt.each_line do |raw|
      line = raw.strip   ## todo/fix: keep leading and trailing spaces - why? why not?

      ## todo/fix: keep blank line nodes?? and just remove comments and process headings?! - why? why not?
      if line.empty?
        fresh_para = true
        next
      end

      break  if line == '__END__'
      next   if line.start_with?( '#' )   ## skip comments too

      ## strip inline (until end-of-line) comments too
      ##  e.g Eupen | KAS Eupen ## [de]
      ##   => Eupen | KAS Eupen
      ##  e.g bq   Bonaire,  BOE        # CONCACAF
      ##   => bq   Bonaire,  BOE
      line = line.sub( /#.*/, '' ).strip
      pp line   if debug?

      ## todo/check: also use heading blank as paragraph "breaker" or treat it like a comment ?? - why? why not?
      next   if HEADING_BLANK_RE.match( line )   # skip "decorative" only heading e.g. ========

      heading_match = HEADING_RE.match( line )
      if heading_match
        fresh_para = true

        level = heading_match[:marker].length   ## count number of = for heading level
        text  = heading_match[:text].strip

        puts "heading #{level} >#{text}<"   if debug?
        outline << [:"h#{level}", text]
        next
      end

      ## assume it's a (plain/regular) text line
      if fresh_para
        outline << [:p, [line]]
        fresh_para = false
        next
      end

      last = outline[-1]    ## get last entry
      unless last[0] == :p  ## assert it's a p(aragraph) node!!!
        puts "!! ERROR - invalid outline state / format - expected p(aragraph) node; got:"
        pp last
        exit 1
      end
      last[1] << line       ## add line to p(aragraph)
    end

    outline
  end # method parse
end # class OutlineReader

end # module SportDb
|