sportdb-parser 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Manifest.txt +0 -4
- data/lib/sportdb/parser/token-date.rb +29 -0
- data/lib/sportdb/parser/version.rb +1 -1
- data/lib/sportdb/parser.rb +0 -5
- metadata +3 -8
- data/bin/fbt +0 -94
- data/lib/sportdb/parser/linter.rb +0 -149
- data/lib/sportdb/parser/opts.rb +0 -70
- data/lib/sportdb/parser/outline_reader.rb +0 -97
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0c9225b21f400b9f9cced2052c3062f41a091ed81d3d4239164c9652f53ebc6e
|
4
|
+
data.tar.gz: f7250eaa21324962df27e7cdd397857afa570c610f00c80c31e5105e40964002
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 471c938c233d8f81d7a0fd5e4470a27a52486906764816b6c35ea3d88e19650c81302fd5ff9ee30b85d3a8e9f81ada8eef20b49bd3de924c7238acb106ba6082
|
7
|
+
data.tar.gz: 24d1cf3846404859ad7e751895325b256321d43e2881413fda6325c744ca0c31b52ef2032a9dfc8e56e67d7a06df54a6d2780a297982440b8e40b7055fe06c26
|
data/CHANGELOG.md
CHANGED
data/Manifest.txt
CHANGED
@@ -2,12 +2,8 @@ CHANGELOG.md
|
|
2
2
|
Manifest.txt
|
3
3
|
README.md
|
4
4
|
Rakefile
|
5
|
-
bin/fbt
|
6
5
|
lib/sportdb/parser.rb
|
7
6
|
lib/sportdb/parser/lang.rb
|
8
|
-
lib/sportdb/parser/linter.rb
|
9
|
-
lib/sportdb/parser/opts.rb
|
10
|
-
lib/sportdb/parser/outline_reader.rb
|
11
7
|
lib/sportdb/parser/parser.rb
|
12
8
|
lib/sportdb/parser/token-date.rb
|
13
9
|
lib/sportdb/parser/token-score.rb
|
@@ -155,6 +155,35 @@ DATE_RE = Regexp.union(
|
|
155
155
|
)
|
156
156
|
|
157
157
|
|
158
|
+
##
|
159
|
+
## add a date parser helper
|
160
|
+
##
# Parse a date string into a Date, using DATE_RE for matching.
#
# str   - text holding the date; expected to match DATE_RE
# start - object responding to year, month and day; used to
#         work out the year if str does not include one
#         (e.g. start of a 2013/14 season => dates on or after the
#          start month/day get 2013, earlier month/days get 2014)
#
# Returns a Date.
#
# Note: if str does not match DATE_RE an error message gets printed
#       and the process is terminated (exit status 1).
# Raises ArgumentError if the match yields no known month (name) or no day.
def self.parse_date( str, start: )
  m = DATE_RE.match( str )
  if m.nil?
    puts "!! ERROR - unexpected date format; cannot parse >#{str}<"
    exit 1
  end

  year  = m[:year].to_i(10)                     if m[:year]
  month = MONTH_MAP[ m[:month_name].downcase ]  if m[:month_name]
  day   = m[:day].to_i(10)                      if m[:day]
  ## note: a matched day name (e.g. Fri) is not needed for building the date

  ## fail with a clear message (instead of a NoMethodError/TypeError on nil below)
  if month.nil? || day.nil?
    raise ArgumentError, "cannot build date from >#{str}<; month (name) or day missing / unknown"
  end

  if year.nil?   ## try to calculate year
    on_or_after_start = month > start.month ||
                        (month == start.month && day >= start.day)

    year = if on_or_after_start
             # assume same year as start_at event (e.g. 2013 for 2013/14 season)
             start.year
           else
             # assume year+1 as start_at event (e.g. 2014 for 2013/14 season)
             start.year + 1
           end
  end

  Date.new( year, month, day )
end
|
184
|
+
|
185
|
+
|
186
|
+
|
158
187
|
###
|
159
188
|
# date duration
|
160
189
|
# use - or + as separator
|
data/lib/sportdb/parser.rb
CHANGED
@@ -24,11 +24,6 @@ require_relative 'parser/lang'
|
|
24
24
|
require_relative 'parser/parser'
|
25
25
|
|
26
26
|
|
27
|
-
## more
|
28
|
-
require_relative 'parser/outline_reader'
|
29
|
-
require_relative 'parser/linter'
|
30
|
-
require_relative 'parser/opts'
|
31
|
-
|
32
27
|
|
33
28
|
###
|
34
29
|
# make parser api (easily) available - why? why not?
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sportdb-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-08-
|
11
|
+
date: 2024-08-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: cocos
|
@@ -74,8 +74,7 @@ dependencies:
|
|
74
74
|
version: '4.1'
|
75
75
|
description: sportdb-parser - football.txt match parser (& tokenizer)
|
76
76
|
email: gerald.bauer@gmail.com
|
77
|
-
executables:
|
78
|
-
- fbt
|
77
|
+
executables: []
|
79
78
|
extensions: []
|
80
79
|
extra_rdoc_files:
|
81
80
|
- CHANGELOG.md
|
@@ -86,12 +85,8 @@ files:
|
|
86
85
|
- Manifest.txt
|
87
86
|
- README.md
|
88
87
|
- Rakefile
|
89
|
-
- bin/fbt
|
90
88
|
- lib/sportdb/parser.rb
|
91
89
|
- lib/sportdb/parser/lang.rb
|
92
|
-
- lib/sportdb/parser/linter.rb
|
93
|
-
- lib/sportdb/parser/opts.rb
|
94
|
-
- lib/sportdb/parser/outline_reader.rb
|
95
90
|
- lib/sportdb/parser/parser.rb
|
96
91
|
- lib/sportdb/parser/token-date.rb
|
97
92
|
- lib/sportdb/parser/token-score.rb
|
data/bin/fbt
DELETED
@@ -1,94 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby

## tip: to test run:
##    ruby -I ./lib bin/fbt

## our own code
require 'sportdb/parser'

require 'optparse'

##
## read textfile(s)
##   and dump tokens (--metal) or the typed parse tree (default);
##   all errors get collected and reported at the end
##
##   fbt  ../openfootball/.../euro.txt

args = ARGV
opts = { debug: false,
         metal: false }

## note: block param named cmd (not parser) to avoid shadowing the outer local
parser = OptionParser.new do |cmd|
  cmd.banner = "Usage: #{$PROGRAM_NAME} [options]"

  ##
  ## check if git has a offline option?? (use same)
  ##   check for other tools - why? why not?

  cmd.on( "--verbose", "--debug",
          "turn on verbose / debug output (default: #{opts[:debug]})" ) do |debug|
    opts[:debug] = debug
  end

  cmd.on( "--metal",
          "turn off typed parse tree; show to the metal tokens"+
          " (default: #{opts[:metal]})" ) do |metal|
    opts[:metal] = metal
  end
end
parser.parse!( args )   ## note: removes all recognized options from args

puts "OPTS:"
p opts
puts "ARGV:"
p args


paths = if args.empty?
          ## no args passed in - fall back to built-in sample datafiles
          [
            '../../../openfootball/euro/2021--europe/euro.txt',
            '../../../openfootball/euro/2024--germany/euro.txt',
          ]
        else
          ## check for directories
          ##   and auto-expand
          SportDb::Parser::Opts.expand_args( args )
        end


SportDb::Parser::Linter.debug = true   if opts[:debug]

linter = SportDb::Parser::Linter.new

errors = []

paths.each_with_index do |path,i|
  puts "==> [#{i+1}/#{paths.size}] reading >#{path}<..."
  linter.read( path, parse: !opts[:metal] )

  ## note: linter.errors gets reset on every read; keep a copy of all here
  errors.concat( linter.errors )  if linter.errors?
end

if errors.size > 0
  puts
  pp errors
  puts
  puts "!! #{errors.size} parse error(s) in #{paths.size} datafile(s)"
else
  puts
  puts "OK no parse errors found in #{paths.size} datafile(s)"
end

puts "bye"
|
94
|
-
|
@@ -1,149 +0,0 @@
|
|
1
|
-
|
2
|
-
module SportDb
class Parser

###
# Linter - reads a datafile via OutlineReader and runs every text line
#   (below a level 1 heading) through the parser (or the tokenizer only);
#   all reported error messages get collected in errors.
#
## note - Linter for now nested inside Parser - keep? why? why not?
class Linter

  def self.debug=(value) @@debug = value; end
  def self.debug?()      @@debug ||= false; end  ## note: default is FALSE
  def debug?()           self.class.debug?; end


  ## list of [path, message, line] triplets of the last read call
  attr_reader :errors

  def initialize
    @errors = []
    @parser = Parser.new   ## use own parser instance (not shared) - why? why not?
  end

  def errors?() @errors.size > 0; end


  ## note: colon (:) MUST be followed by one (or more) spaces
  ##      make sure mon feb 12 18:10 will not match
  ##        allow 1. FC Köln etc.
  ##               Mainz 05:
  ##           limit key to first char plus up to 30 more chars
  ##           only allow chars incl. intl but (NOT ()[]/;)
  ##
  ##   Group A:
  ##   Group B:   - remove colon
  ##     or lookup first
  ATTRIB_RE = %r{^
                   [ ]*?     # slurp leading spaces
                (?<key>[^:|\]\[()\/; -]
                       [^:|\]\[()\/;]{0,30}
                 )
                   [ ]*?     # slurp trailing spaces
                   :[ ]+
                (?<value>.+)
                    [ ]*?   # slurp trailing spaces
                   $
                }ix


  #########
  ## Read (and lint) the datafile at path.
  ##
  ##  parse - false (default) - tokenize (only)
  ##        - true            - tokenize & parse
  ##
  ##  Returns the outline nodes as returned by OutlineReader.read.
  ##  Raises ArgumentError on any node type other than :h1 or :p.
  def read( path, parse: false )
    ## note: every (new) read call - resets errors list to empty
    @errors = []

    nodes = OutlineReader.read( path )

    ## process nodes
    h1           = nil
    orphans      = 0     ## count paragraphs with no heading
    attrib_found = false ## inside a (possibly multi-line) key/value attribute?

    nodes.each do |node|
      type = node[0]

      case type
      when :h1
        h1 = node[1]  ## get heading text
        puts
        puts " = Heading 1 >#{node[1]}<"
      when :p
        if h1.nil?
          orphans += 1    ## note: warns for every orphan paragraph (with running count)
          puts "!! WARN - no heading for #{orphans} text paragraph(s); skipping parse"
          next
        end

        node[1].each do |line|
          if debug?
            puts
            puts "line >#{line}<"
          end

          ## skip new (experimental attrib syntax)
          if !attrib_found && ATTRIB_RE.match?( line )
            ## note: check attrib regex AFTER group def e.g.:
            ##         Group A:
            ##         Group B:  etc.
            ##     todo/fix - change Group A: to Group A etc.
            ##                       Group B: to Group B
            attrib_found = true
            next
          end

          if attrib_found
            ## attribute value continues until a line ends with a dot;
            ##   skip all lines up to (and including) that one
            attrib_found = false   if line.end_with?( '.' )
            next
          end

          t, error_messages = if parse
                                @parser.parse_with_errors( line )
                              else
                                @parser.tokenize_with_errors( line )
                              end

          ## add to "global" error list
          ##   make a triplet tuple (file / msg / line text)
          error_messages.each do |msg|
            @errors << [ path, msg, line ]
          end

          pp t   if debug?
        end
      else
        pp node
        raise ArgumentError, "unsupported (node) type >#{type}<"
      end
    end  # each node
  end  # read
end  # class Linter


end   # class Parser
end   # module SportDb
|
data/lib/sportdb/parser/opts.rb
DELETED
@@ -1,70 +0,0 @@
|
|
1
|
-
|
2
|
-
module SportDb
class Parser

###
# Opts - helpers for expanding command line arguments
#          (files or directories) into a list of datafile paths.
#
## note - Opts Helpers for now nested inside Parser - keep here? why? why not?
class Opts

  ## season (directory) name e.g. 2024-25 or 2024 or 2024--germany
  SEASON_RE = %r{ (?:
                     \d{4}-\d{2}
                   | \d{4}(--[a-z0-9_-]+)?
                  )
                }x
  SEASON = SEASON_RE.source    ## "inline" helper for embedding in other regexes - keep? why? why not?


  ## note: if pattern includes directory add here
  ##     (otherwise move to more "generic" datafile) - why? why not?
  MATCH_RE = %r{ (?: ^|/ )      # beginning (^) or beginning of path (/)
                     #{SEASON}
                     /[a-z0-9_-]+\.txt$  ## txt e.g /1-premierleague.txt
               }x


  ##
  ## Find all datafiles (matching MATCH_RE) in the directory path
  ##   incl. all subdirectories.
  ##
  ##  Returns an array of (expanded) file paths.
  def self.find( path )
    ## note: normalize path - use File.expand_path ??
    ##    change all backslash to slash for now
    ## path = path.gsub( "\\", '/' )
    root = File.expand_path( path )

    ## check all txt files
    ## note: incl. files starting with dot (.)) as candidates
    ##     (normally excluded with just *)
    ## note: pass in root via base (and NOT as part of the pattern)
    ##     - otherwise glob chars in the directory name e.g. [] or {}
    ##       get interpreted as a pattern
    candidates = Dir.glob( '**/{*,.*}.txt', base: root )
                    .map { |relpath| File.join( root, relpath ) }

    candidates.select { |candidate| MATCH_RE.match?( candidate ) }
  end


  ##
  ## Expand all args - directories get replaced by the datafiles found
  ##   (printing a summary); everything else gets passed along as is.
  ##
  ##  Returns an array of paths.
  def self.expand_args( args )
    args.flat_map do |arg|
      ## check if directory
      if Dir.exist?( arg )
        datafiles = find( arg )
        puts
        puts " found #{datafiles.size} match txt datafiles in #{arg}"
        pp datafiles
        datafiles
      else
        ## assume it's a file
        [arg]
      end
    end
  end
end  # class Opts


end   # class Parser
end   # module SportDb
|
@@ -1,97 +0,0 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
module SportDb

###
# OutlineReader - splits a text into a flat list of outline nodes:
#    headings     e.g. [:h1, "text"], [:h2, "text"], etc.
#    paragraphs   e.g. [:p, ["line 1", "line 2"]]
#  blank lines separate paragraphs; comments (#) get stripped;
#  everything after a line with __END__ gets ignored.
class OutlineReader

  def self.debug=(value) @@debug = value; end
  def self.debug?()      @@debug ||= false; end
  def debug?()           self.class.debug?; end


  ## Read the file at path (as utf-8) and parse its text.
  def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
    ## note: bom|utf-8 drops a leading byte order mark (if present)
    ##         - otherwise a heading in the very first line would not get
    ##           recognized (and end up as text line incl. the mark)
    txt = File.open( path, 'r:bom|utf-8' ) { |f| f.read }
    parse( txt )
  end

  ## Parse txt; returns the outline (array of nodes).
  def self.parse( txt )
    new( txt ).parse
  end

  def initialize( txt )
    @txt = txt
  end

  ## note: skip "decorative" only heading e.g. ========
  ##  todo/check: find a better name e.g. HEADING_EMPTY_RE or HEADING_LINE_RE or ???
  HEADING_BLANK_RE = %r{\A
                        ={1,}
                        \z}x

  ## note: like in wikimedia markup (and markdown) all optional trailing ==== too
  HEADING_RE = %r{\A
                  (?<marker>={1,})       ## 1. leading ======
                    [ ]*
                  (?<text>[^=]+)         ## 2. text  (note: for now no "inline" = allowed)
                    [ ]*
                    =*                   ## 3. (optional) trailing ====
                  \z}x

  ## Returns the outline for the text passed in on creation.
  def parse
    outline    = []     ## outline structure
    start_para = true   ## start new para(graph) on new text line?

    @txt.each_line do |line|
      line = line.strip      ## todo/fix: keep leading and trailing spaces - why? why not?

      if line.empty?    ## todo/fix: keep blank line nodes?? and just remove comments and process headings?! - why? why not?
        start_para = true
        next
      end

      break if line == '__END__'

      next  if line.start_with?( '#' )   ## skip comments too

      ## strip inline (until end-of-line) comments too
      ##  e.g Eupen | KAS Eupen ## [de]
      ##   => Eupen | KAS Eupen
      ##  e.g bq   Bonaire,  BOE        # CONCACAF
      ##   => bq   Bonaire,  BOE
      line = line.sub( /#.*/, '' ).strip
      pp line    if debug?

      ## todo/check: also use heading blank as paragraph "breaker" or treat it like a comment ?? - why? why not?
      next if HEADING_BLANK_RE.match?( line )  # skip "decorative" only heading e.g. ========

      ## note: like in wikimedia markup (and markdown) all optional trailing ==== too
      if (m = HEADING_RE.match( line ))
        start_para = true

        level   = m[:marker].length   ## count number of = for heading level
        heading = m[:text].strip

        puts "heading #{level} >#{heading}<"   if debug?
        outline << [:"h#{level}", heading]
      elsif start_para    ## assume it's a (plain/regular) text line
        outline << [:p, [line]]
        start_para = false
      else
        node = outline.last     ## get last entry
        if node[0] == :p        ## assert it's a p(aragraph) node!!!
          node[1] << line       ## add line to p(aragraph)
        else
          puts "!! ERROR - invalid outline state / format - expected p(aragraph) node; got:"
          pp node
          exit 1
        end
      end
    end
    outline
  end # method parse
end # class OutlineReader

end # module SportDb
|