sportdb-parser 0.2.2 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Manifest.txt +3 -0
- data/bin/fbtok +67 -0
- data/lib/sportdb/parser/linter.rb +149 -0
- data/lib/sportdb/parser/outline_reader.rb +97 -0
- data/lib/sportdb/parser/token-text.rb +50 -40
- data/lib/sportdb/parser/version.rb +2 -2
- data/lib/sportdb/parser.rb +5 -0
- metadata +7 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9ebb468318f2b87c33ca66afb6c46611ce5f420258e0c41b40a2cbfabcff7a49
|
4
|
+
data.tar.gz: 0cf1d511f3e936d73531442d1ca6bef94d90a50ae65346b5b57347d4d294dc77
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e992ab97d7ae18c514de14078d30eb36adc40f5044242ce9ace089fb88b104c61b29ff86a2aa8101bb7257c3ff2ce32c6150439ff855e195bee1b26032bb0d9d
|
7
|
+
data.tar.gz: 25e66e45e7daf2783bc6507a3cb2c660d9153eab9530210ef51ef6e0d5d3fc531e5891897be3b0492b0ad7ea5fe3d406a0a3dd0559549b85518360d442ed4d8b
|
data/CHANGELOG.md
CHANGED
data/Manifest.txt
CHANGED
@@ -2,8 +2,11 @@ CHANGELOG.md
|
|
2
2
|
Manifest.txt
|
3
3
|
README.md
|
4
4
|
Rakefile
|
5
|
+
bin/fbtok
|
5
6
|
lib/sportdb/parser.rb
|
6
7
|
lib/sportdb/parser/lang.rb
|
8
|
+
lib/sportdb/parser/linter.rb
|
9
|
+
lib/sportdb/parser/outline_reader.rb
|
7
10
|
lib/sportdb/parser/parser.rb
|
8
11
|
lib/sportdb/parser/token-date.rb
|
9
12
|
lib/sportdb/parser/token-score.rb
|
data/bin/fbtok
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
#!/usr/bin/env ruby

## fbtok - command-line front end for SportDb::Parser::Linter
##   reads every datafile passed in on the command line,
##   tokenizes it (and parses it too unless --metal is set)
##   and prints all collected errors at the end
##
## tip: to test run:
##   ruby -I ./lib bin/fbtok

require 'sportdb/parser'

require 'optparse'    ## check - already auto-required in cocos? keep? why? why not?


args = ARGV

## defaults
##  note: debug is on by default - passing --verbose / --debug
##          sets it to true (again)
opts = {
  debug: true,
  metal: false,
}

## note: the block param is named cli (not parser)
##         to avoid shadowing the outer parser local
parser = OptionParser.new do |cli|
  cli.banner = "Usage: #{$PROGRAM_NAME} [options]"

  cli.on( "--verbose", "--debug",
          "turn on verbose / debug output (default: #{opts[:debug]})" ) do |debug|
    opts[:debug] = debug
  end

  cli.on( "--metal",
          "turn off typed parse tree; show to the metal tokens"+
          " (default: #{opts[:metal]})" ) do |metal|
    opts[:metal] = metal
  end
end
## note: parse! removes all recognized options from args;
##         what is left are the datafile paths
parser.parse!( args )

puts "OPTS:"
p opts
puts "ARGV:"
p args


SportDb::Parser::Linter.debug = true   if opts[:debug]

linter = SportDb::Parser::Linter.new
errors = []   ## errors collected across ALL datafiles

paths = args
paths.each_with_index do |path,i|
  puts "==> [#{i+1}/#{paths.size}] reading >#{path}<..."
  linter.read( path, parse: !opts[:metal] )

  ## note: linter.errors gets reset on every read - copy (add) before next file
  errors += linter.errors   if linter.errors?
end

if errors.size > 0
  puts
  pp errors
  puts
  puts "!! #{errors.size} parse error(s) in #{paths.size} datafile(s)"
else
  puts
  puts "OK no parse errors found in #{paths.size} datafile(s)"
end


puts "bye"
|
67
|
+
|
@@ -0,0 +1,149 @@
|
|
1
|
+
|
2
|
+
module SportDb
class Parser

###
## note - Linter for now nested inside Parser - keep? why? why not?
##
##  reads a datafile via OutlineReader and runs every text line
##    through the tokenizer (or the tokenizer & parser);
##    all reported error messages get collected in errors
class Linter

  def self.debug=(value) @@debug = value; end
  def self.debug?()      @@debug ||= false; end  ## note: default is FALSE
  def debug?()           self.class.debug?; end


  ## list of [path, msg, line] triplets collected by the last read call
  attr_reader :errors

  def initialize
    @errors = []
    @parser = Parser.new   ## use own parser instance (not shared) - why? why not?
  end


  ## true if the last read call collected one (or more) errors
  def errors?() @errors.size > 0; end


  ## note: colon (:) MUST be followed by one (or more) spaces
  ##    make sure mon feb 12 18:10 will not match
  ##    allow 1. FC Köln etc.
  ##          Mainz 05:
  ##    limit to 30 chars max
  ##    only allow chars incl. intl but (NOT ()[]/;)
  ##
  ##   Group A:
  ##   Group B:   - remove colon
  ##    or lookup first

  ATTRIB_RE = %r{^
                   [ ]*?     # slurp leading spaces
                (?<key>[^:|\]\[()\/; -]
                       [^:|\]\[()\/;]{0,30}
                 )
                   [ ]*?     # slurp trailing spaces
                   :[ ]+
                (?<value>.+)
                    [ ]*?   # slurp trailing spaces
                   $
                }ix


  #########
  ## read the datafile in path and collect all error messages in errors
  ##
  ##   parse - false (default) - tokenize (only)
  ##         - true            - tokenize & parse
  ##
  ##  note: paragraphs before the first level-one heading
  ##          get skipped (with a warning)
  ##  note: any node type other than :h1 or :p raises an ArgumentError
  def read( path, parse: false )
    ## note: every (new) read call - resets errors list to empty
    @errors = []

    nodes = OutlineReader.read( path )

    ## process nodes
    h1      = nil
    orphans = 0    ## running count of paragraphs with no heading

    ## NOTE(review): this flag is NOT reset per paragraph or heading -
    ##   an attribute with no closing dot keeps skipping lines
    ##   in all following paragraphs too - confirm that is intended
    attrib_found = false


    nodes.each do |node|
      type = node[0]

      if type == :h1
        h1 = node[1]  ## get heading text
        puts
        puts " = Heading 1 >#{node[1]}<"
      elsif type == :p

        if h1.nil?
          ## note: warns for every orphan paragraph (with the running count)
          orphans += 1
          puts "!! WARN - no heading for #{orphans} text paragraph(s); skipping parse"
          next
        end

        node[1].each do |line|

          if debug?
            puts
            puts "line >#{line}<"
          end


          ## skip new (experimental attrib syntax)
          if !attrib_found &&
             ATTRIB_RE.match?( line )
            ## note: check attrib regex AFTER group def e.g.:
            ##         Group A:
            ##         Group B:  etc.
            ##     todo/fix - change Group A: to Group A etc.
            ##                       Group B: to Group B
            ##
            ## NOTE(review): the first attribute line is never checked
            ##   for a closing dot - the line after it always gets skipped
            ##   too - confirm that is intended
            attrib_found = true
            next
          end

          if attrib_found
            ## keep skipping (continuation) lines
            ##   up to and including the first line that ends with a dot
            attrib_found = false   if line.end_with?( '.' )
            next
          end

          t, error_messages = if parse
                                @parser.parse_with_errors( line )
                              else
                                @parser.tokenize_with_errors( line )
                              end


          ## add to "global" error list
          ##   make a triplet tuple (file / msg / line text)
          error_messages.each do |msg|
            @errors << [path, msg, line]
          end

          pp t   if debug?
        end
      else
        pp node
        raise ArgumentError, "unsupported (node) type >#{type}<"
      end
    end  # each node
  end  # read
end  # class Linter


end   # class Parser
end   # module SportDb
|
@@ -0,0 +1,97 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
module SportDb

##
##  reads a text (file) and splits it into a flat outline, that is,
##    a list of heading and paragraph nodes e.g.
##
##      [[:h1, "Heading"],
##       [:p,  ["line 1", "line 2"]],
##       [:h2, "Sub Heading"],
##       ...
##      ]
class OutlineReader

  def self.debug=(value) @@debug = value; end
  def self.debug?()      @@debug ||= false; end
  def debug?()           self.class.debug?; end



  ## read the file in path (as utf-8) and return its outline
  ##   note: a leading byte order mark (BOM) gets stripped -
  ##           otherwise a heading in the very first line is not recognized
  def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
    txt = File.open( path, 'r:BOM|UTF-8' ) {|f| f.read }
    parse( txt )
  end

  ## return the outline for the text passed in
  def self.parse( txt )
    new( txt ).parse
  end

  def initialize( txt )
    @txt = txt
  end

  ## note: skip "decorative" only heading e.g. ========
  ##  todo/check: find a better name e.g. HEADING_EMPTY_RE or HEADING_LINE_RE or ???
  HEADING_BLANK_RE = %r{\A
                        ={1,}
                        \z}x

  ## note: like in wikimedia markup (and markdown) all optional trailing ==== too
  HEADING_RE = %r{\A
                   (?<marker>={1,})   ## 1. leading ======
                     [ ]*
                   (?<text>[^=]+)     ## 2. text  (note: for now no "inline" = allowed)
                     [ ]*
                     =*               ## 3. (optional) trailing ====
                   \z}x

  ## split the text into heading and paragraph nodes
  ##   - blank lines end the current paragraph
  ##   - lines starting with # and inline (end-of-line) # comments get removed
  ##   - a line with __END__ stops all processing
  ##
  ##  returns an array of [:h<level>, text] and [:p, [lines]] nodes
  ##  raises an ArgumentError if a text line cannot be added to a paragraph
  def parse
    outline=[]   ## outline structure
    start_para = true      ## start new para(graph) on new text line?

    @txt.each_line do |line|
      line = line.strip      ## todo/fix: keep leading and trailing spaces - why? why not?

      if line.empty?    ## todo/fix: keep blank line nodes?? and just remove comments and process headings?! - why? why not?
        start_para = true
        next
      end

      break if line == '__END__'

      next if line.start_with?( '#' )   ## skip comments too
      ## strip inline (until end-of-line) comments too
      ##  e.g Eupen | KAS Eupen ## [de]
      ##   => Eupen | KAS Eupen
      ##  e.g bq   Bonaire,  BOE        # CONCACAF
      ##   => bq   Bonaire,  BOE
      line = line.sub( /#.*/, '' ).strip
      pp line    if debug?

      ## todo/check: also use heading blank as paragraph "breaker" or treat it like a comment ?? - why? why not?
      next if HEADING_BLANK_RE.match?( line )  # skip "decorative" only heading e.g. ========

      ## note: like in wikimedia markup (and markdown) all optional trailing ==== too
      if m=HEADING_RE.match( line )
        start_para = true

        heading_level  = m[:marker].length   ## count number of = for heading level
        heading        = m[:text].strip

        puts "heading #{heading_level} >#{heading}<"    if debug?
        outline << [:"h#{heading_level}", heading]
      elsif start_para    ## (plain/regular) text line - opens a new paragraph
        outline << [:p, [line]]
        start_para = false
      else                ## (plain/regular) text line - continues the paragraph
        node = outline[-1]    ## get last entry
        ## assert it's a p(aragraph) node!!!
        ##   note: raise (do NOT exit) - keep the decision with the caller
        unless node && node[0] == :p
          raise ArgumentError, "invalid outline state / format - expected p(aragraph) node; got: #{node.inspect}"
        end
        node[1] << line    ## add line to p(aragraph)
      end
    end
    outline
  end # method parse
end # class OutlineReader

end # module SportDb
|
@@ -1,12 +1,12 @@
|
|
1
|
-
module SportDb
|
1
|
+
module SportDb
|
2
2
|
class Parser
|
3
|
-
|
4
|
-
|
3
|
+
|
4
|
+
|
5
5
|
## note - do NOT allow single alpha text for now
|
6
|
-
## add later?? A - B C - D - why?
|
6
|
+
## add later?? A - B C - D - why?
|
7
7
|
## opt 1) one alpha
|
8
|
-
## (?<text_i> [a-z]) # only allow single letter text (not numbers!!)
|
9
|
-
|
8
|
+
## (?<text_i> [a-z]) # only allow single letter text (not numbers!!)
|
9
|
+
|
10
10
|
## opt 2) more than one alphanum
|
11
11
|
|
12
12
|
|
@@ -26,19 +26,19 @@ class Parser
|
|
26
26
|
|
27
27
|
|
28
28
|
TEXT_RE = %r{
|
29
|
-
## must start with alpha (allow unicode letters!!)
|
30
|
-
(?<text>
|
31
|
-
## positive lookbehind
|
29
|
+
## must start with alpha (allow unicode letters!!)
|
30
|
+
(?<text>
|
31
|
+
## positive lookbehind
|
32
32
|
## (MUST be fixed number of chars - no quantifier e.g. +? etc.)
|
33
33
|
(?<=[ ,;@|\[\]]
|
34
34
|
|^
|
35
35
|
)
|
36
|
-
(?:
|
36
|
+
(?:
|
37
37
|
# opt 1 - start with alpha
|
38
38
|
\p{L}+ ## all unicode letters (e.g. [a-z])
|
39
39
|
|
|
40
40
|
|
41
|
-
# opt 2 - start with num!! - allow special case (e.g. 1. FC)
|
41
|
+
# opt 2 - start with num!! - allow special case (e.g. 1. FC)
|
42
42
|
\d+ # check for num lookahead (MUST be space or dot)
|
43
43
|
## MUST be followed by (optional dot) and
|
44
44
|
## required space !!!
|
@@ -46,69 +46,79 @@ TEXT_RE = %r{
|
|
46
46
|
\.? ## optional dot
|
47
47
|
[ ]? ## make space optional too - why? why not?
|
48
48
|
## yes - eg. 1st, 2nd, 5th etc.
|
49
|
-
\p{L}+
|
49
|
+
\p{L}+
|
50
50
|
)
|
51
|
-
|
51
|
+
|
52
52
|
(?:(?: (?:[ ]
|
53
53
|
(?!vs?\.?[ ]) ## note - exclude (v[ ]/vs[ ]/v.[ ]/vs.[ ])
|
54
|
-
)
|
54
|
+
)
|
55
55
|
| # only single spaces allowed inline!!!
|
56
|
-
[-]
|
56
|
+
[-]
|
57
57
|
)?
|
58
58
|
(?:
|
59
59
|
\p{L} |
|
60
|
-
[&/']
|
60
|
+
[&/']
|
61
61
|
|
|
62
62
|
(?:
|
63
|
-
\d+
|
64
|
-
(?![0-9.:h'/+-])
|
63
|
+
\d+
|
64
|
+
(?![0-9.:h'/+-])
|
65
65
|
## negative lookahead for numbers
|
66
66
|
## note - include digits itself!!!
|
67
|
-
)|
|
68
|
-
\.
|
69
|
-
)
|
67
|
+
)|
|
68
|
+
\.
|
69
|
+
)
|
70
70
|
)* ## must NOT end with space or dash(-)
|
71
71
|
## todo/fix - possible in regex here
|
72
72
|
## only end in alphanum a-z0-9 (not dot or & ???)
|
73
73
|
|
74
|
-
|
74
|
+
|
75
75
|
## allow optional at the end
|
76
76
|
## tag or year
|
77
|
-
## make it and in the future - why? why not?
|
78
|
-
##
|
77
|
+
## make it and in the future - why? why not?
|
78
|
+
##
|
79
|
+
## change - fix
|
80
|
+
## do NOT use (A) for amateur
|
81
|
+
## use A or A. with NO ()!!!
|
79
82
|
## (A) - allow with predined alpha only for now
|
80
83
|
## e.g. (A) - amateur a team or b?
|
84
|
+
### same for U21 or U9 etc
|
85
|
+
## use with NO ()!!! - why? why not?
|
81
86
|
## or U21 U9 etc. - why? why not?
|
82
87
|
## or etc.
|
83
88
|
## (1879-1893) or allow years e.g. (1879-1893)
|
84
|
-
###
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
89
|
+
###
|
90
|
+
## add allow country code three to five letters for now
|
91
|
+
## change to generic 1 to 5 - why? why not?
|
92
|
+
## e.g. (A), (I),
|
93
|
+
## (AUT)
|
94
|
+
## (TRNC) five? for UEFA code for northern cyprus
|
95
|
+
## change to 1 to 4 - why? why not?
|
96
|
+
## check - fix possible for upper case only here
|
97
|
+
## inline for this group only?
|
93
98
|
(?:
|
94
|
-
[ ]
|
99
|
+
[ ]
|
95
100
|
\(
|
96
101
|
\d{4}-\d{4}
|
97
102
|
\)
|
98
|
-
)?
|
99
|
-
|
103
|
+
)?
|
104
|
+
(?:
|
105
|
+
[ ]+ ## allow more than once space - why? why not?
|
106
|
+
\( (?:
|
107
|
+
[A-Z]{1,5}
|
108
|
+
)
|
109
|
+
\)
|
110
|
+
)?
|
100
111
|
## add lookahead/lookbehind
|
101
|
-
## must be space!!!
|
112
|
+
## must be space!!!
|
102
113
|
## (or comma or start/end of string)
|
103
114
|
## kind of \b !!!
|
104
115
|
## positive lookahead
|
105
116
|
(?=[ ,;@|\[\]]
|
106
117
|
|$
|
107
118
|
)
|
108
|
-
)
|
119
|
+
)
|
109
120
|
}ix
|
110
121
|
|
111
122
|
|
112
123
|
end # class Parser
|
113
|
-
end # module SportDb
|
114
|
-
|
124
|
+
end # module SportDb
|
data/lib/sportdb/parser.rb
CHANGED
@@ -24,6 +24,11 @@ require_relative 'parser/lang'
|
|
24
24
|
require_relative 'parser/parser'
|
25
25
|
|
26
26
|
|
27
|
+
####
|
28
|
+
## todo/check - move outline reader upstream to cocos - why? why not?
|
29
|
+
## use read_outline(), parse_outline() - why? why not?
|
30
|
+
require_relative 'parser/outline_reader'
|
31
|
+
require_relative 'parser/linter'
|
27
32
|
|
28
33
|
###
|
29
34
|
# make parser api (easily) available - why? why not?
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sportdb-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-09-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: cocos
|
@@ -74,7 +74,8 @@ dependencies:
|
|
74
74
|
version: '4.1'
|
75
75
|
description: sportdb-parser - football.txt match parser (& tokenizer)
|
76
76
|
email: gerald.bauer@gmail.com
|
77
|
-
executables:
|
77
|
+
executables:
|
78
|
+
- fbtok
|
78
79
|
extensions: []
|
79
80
|
extra_rdoc_files:
|
80
81
|
- CHANGELOG.md
|
@@ -85,8 +86,11 @@ files:
|
|
85
86
|
- Manifest.txt
|
86
87
|
- README.md
|
87
88
|
- Rakefile
|
89
|
+
- bin/fbtok
|
88
90
|
- lib/sportdb/parser.rb
|
89
91
|
- lib/sportdb/parser/lang.rb
|
92
|
+
- lib/sportdb/parser/linter.rb
|
93
|
+
- lib/sportdb/parser/outline_reader.rb
|
90
94
|
- lib/sportdb/parser/parser.rb
|
91
95
|
- lib/sportdb/parser/token-date.rb
|
92
96
|
- lib/sportdb/parser/token-score.rb
|