sportdb-parser 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Manifest.txt +3 -0
- data/bin/fbtok +67 -0
- data/lib/sportdb/parser/linter.rb +149 -0
- data/lib/sportdb/parser/outline_reader.rb +97 -0
- data/lib/sportdb/parser/token-text.rb +50 -40
- data/lib/sportdb/parser/version.rb +2 -2
- data/lib/sportdb/parser.rb +5 -0
- metadata +7 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9ebb468318f2b87c33ca66afb6c46611ce5f420258e0c41b40a2cbfabcff7a49
|
4
|
+
data.tar.gz: 0cf1d511f3e936d73531442d1ca6bef94d90a50ae65346b5b57347d4d294dc77
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e992ab97d7ae18c514de14078d30eb36adc40f5044242ce9ace089fb88b104c61b29ff86a2aa8101bb7257c3ff2ce32c6150439ff855e195bee1b26032bb0d9d
|
7
|
+
data.tar.gz: 25e66e45e7daf2783bc6507a3cb2c660d9153eab9530210ef51ef6e0d5d3fc531e5891897be3b0492b0ad7ea5fe3d406a0a3dd0559549b85518360d442ed4d8b
|
data/CHANGELOG.md
CHANGED
data/Manifest.txt
CHANGED
@@ -2,8 +2,11 @@ CHANGELOG.md
|
|
2
2
|
Manifest.txt
|
3
3
|
README.md
|
4
4
|
Rakefile
|
5
|
+
bin/fbtok
|
5
6
|
lib/sportdb/parser.rb
|
6
7
|
lib/sportdb/parser/lang.rb
|
8
|
+
lib/sportdb/parser/linter.rb
|
9
|
+
lib/sportdb/parser/outline_reader.rb
|
7
10
|
lib/sportdb/parser/parser.rb
|
8
11
|
lib/sportdb/parser/token-date.rb
|
9
12
|
lib/sportdb/parser/token-score.rb
|
data/bin/fbtok
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
## tip: to test run:
|
4
|
+
## ruby -I ./lib bin/fbtok
|
5
|
+
|
6
|
+
require 'sportdb/parser'
|
7
|
+
|
8
|
+
|
9
|
+
require 'optparse' ## check - already auto-required in cocos? keep? why? why not?
|
10
|
+
|
11
|
+
|
12
|
+
args=ARGV
|
13
|
+
|
14
|
+
|
15
|
+
opts = {
|
16
|
+
debug: true,
|
17
|
+
metal: false,
|
18
|
+
}
|
19
|
+
|
20
|
+
parser = OptionParser.new do |parser|
|
21
|
+
parser.banner = "Usage: #{$PROGRAM_NAME} [options]"
|
22
|
+
|
23
|
+
parser.on( "--verbose", "--debug",
|
24
|
+
"turn on verbose / debug output (default: #{opts[:debug]})" ) do |debug|
|
25
|
+
opts[:debug] = debug
|
26
|
+
end
|
27
|
+
|
28
|
+
parser.on( "--metal",
|
29
|
+
"turn off typed parse tree; show to the metal tokens"+
|
30
|
+
" (default: #{opts[:metal]})" ) do |metal|
|
31
|
+
opts[:metal] = metal
|
32
|
+
end
|
33
|
+
end
|
34
|
+
parser.parse!( args )
|
35
|
+
|
36
|
+
puts "OPTS:"
|
37
|
+
p opts
|
38
|
+
puts "ARGV:"
|
39
|
+
p args
|
40
|
+
|
41
|
+
|
42
|
+
SportDb::Parser::Linter.debug = true if opts[:debug]
|
43
|
+
|
44
|
+
linter = SportDb::Parser::Linter.new
|
45
|
+
errors = []
|
46
|
+
|
47
|
+
paths = args
|
48
|
+
paths.each_with_index do |path,i|
|
49
|
+
puts "==> [#{i+1}/#{paths.size}] reading >#{path}<..."
|
50
|
+
linter.read( path, parse: !opts[:metal] )
|
51
|
+
|
52
|
+
errors += linter.errors if linter.errors?
|
53
|
+
end
|
54
|
+
|
55
|
+
if errors.size > 0
|
56
|
+
puts
|
57
|
+
pp errors
|
58
|
+
puts
|
59
|
+
puts "!! #{errors.size} parse error(s) in #{paths.size} datafiles(s)"
|
60
|
+
else
|
61
|
+
puts
|
62
|
+
puts "OK no parse errors found in #{paths.size} datafile(s)"
|
63
|
+
end
|
64
|
+
|
65
|
+
|
66
|
+
puts "bye"
|
67
|
+
|
@@ -0,0 +1,149 @@
|
|
1
|
+
|
2
|
+
module SportDb
|
3
|
+
class Parser
|
4
|
+
|
5
|
+
###
|
6
|
+
## note - Linter for now nested inside Parser - keep? why? why not?
|
7
|
+
class Linter
|
8
|
+
|
9
|
+
def self.debug=(value) @@debug = value; end
|
10
|
+
def self.debug?() @@debug ||= false; end ## note: default is FALSE
|
11
|
+
def debug?() self.class.debug?; end
|
12
|
+
|
13
|
+
|
14
|
+
|
15
|
+
attr_reader :errors
|
16
|
+
|
17
|
+
def initialize
|
18
|
+
@errors = []
|
19
|
+
@parser = Parser.new ## use own parser instance (not shared) - why? why not?
|
20
|
+
end
|
21
|
+
|
22
|
+
|
23
|
+
def errors?() @errors.size > 0; end
|
24
|
+
|
25
|
+
|
26
|
+
|
27
|
+
## note: colon (:) MUST be followed by one (or more) spaces
|
28
|
+
## make sure mon feb 12 18:10 will not match
|
29
|
+
## allow 1. FC Köln etc.
|
30
|
+
## Mainz 05:
|
31
|
+
## limit to 30 chars max
|
32
|
+
## only allow chars incl. intl but (NOT ()[]/;)
|
33
|
+
##
|
34
|
+
## Group A:
|
35
|
+
## Group B: - remove colon
|
36
|
+
## or lookup first
|
37
|
+
|
38
|
+
ATTRIB_RE = %r{^
|
39
|
+
[ ]*? # slurp leading spaces
|
40
|
+
(?<key>[^:|\]\[()\/; -]
|
41
|
+
[^:|\]\[()\/;]{0,30}
|
42
|
+
)
|
43
|
+
[ ]*? # slurp trailing spaces
|
44
|
+
:[ ]+
|
45
|
+
(?<value>.+)
|
46
|
+
[ ]*? # slurp trailing spaces
|
47
|
+
$
|
48
|
+
}ix
|
49
|
+
|
50
|
+
|
51
|
+
#########
|
52
|
+
## parse - false (default) - tokenize (only)
|
53
|
+
## - true - tokenize & parse
|
54
|
+
def read( path, parse: false )
|
55
|
+
## note: every (new) read call - resets errors list to empty
|
56
|
+
@errors = []
|
57
|
+
|
58
|
+
nodes = OutlineReader.read( path )
|
59
|
+
|
60
|
+
## process nodes
|
61
|
+
h1 = nil
|
62
|
+
orphans = 0 ## track paragraphs's with no heading
|
63
|
+
|
64
|
+
attrib_found = false
|
65
|
+
|
66
|
+
|
67
|
+
nodes.each do |node|
|
68
|
+
type = node[0]
|
69
|
+
|
70
|
+
if type == :h1
|
71
|
+
h1 = node[1] ## get heading text
|
72
|
+
puts
|
73
|
+
puts " = Heading 1 >#{node[1]}<"
|
74
|
+
elsif type == :p
|
75
|
+
|
76
|
+
if h1.nil?
|
77
|
+
orphans += 1 ## only warn once
|
78
|
+
puts "!! WARN - no heading for #{orphans} text paragraph(s); skipping parse"
|
79
|
+
next
|
80
|
+
end
|
81
|
+
|
82
|
+
lines = node[1]
|
83
|
+
|
84
|
+
tree = []
|
85
|
+
lines.each_with_index do |line,i|
|
86
|
+
|
87
|
+
if debug?
|
88
|
+
puts
|
89
|
+
puts "line >#{line}<"
|
90
|
+
end
|
91
|
+
|
92
|
+
|
93
|
+
## skip new (experimental attrib syntax)
|
94
|
+
if attrib_found == false &&
|
95
|
+
ATTRIB_RE.match?( line )
|
96
|
+
## note: check attrib regex AFTER group def e.g.:
|
97
|
+
## Group A:
|
98
|
+
## Group B: etc.
|
99
|
+
## todo/fix - change Group A: to Group A etc.
|
100
|
+
## Group B: to Group B
|
101
|
+
attrib_found = true
|
102
|
+
## logger.debug "skipping key/value line - >#{line}<"
|
103
|
+
next
|
104
|
+
end
|
105
|
+
|
106
|
+
if attrib_found
|
107
|
+
## check if line ends with dot
|
108
|
+
## if not slurp up lines to the next dot!!!
|
109
|
+
## logger.debug "skipping key/value line - >#{line}<"
|
110
|
+
attrib_found = false if line.end_with?( '.' )
|
111
|
+
# logger.debug "skipping key/value line (cont.) - >#{line}<"
|
112
|
+
next
|
113
|
+
end
|
114
|
+
|
115
|
+
t, error_messages = if parse
|
116
|
+
@parser.parse_with_errors( line )
|
117
|
+
else
|
118
|
+
@parser.tokenize_with_errors( line )
|
119
|
+
end
|
120
|
+
|
121
|
+
|
122
|
+
if error_messages.size > 0
|
123
|
+
## add to "global" error list
|
124
|
+
## make a triplet tuple (file / msg / line text)
|
125
|
+
error_messages.each do |msg|
|
126
|
+
@errors << [ path,
|
127
|
+
msg,
|
128
|
+
line
|
129
|
+
]
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
133
|
+
pp t if debug?
|
134
|
+
|
135
|
+
tree << t
|
136
|
+
end
|
137
|
+
|
138
|
+
## pp tree
|
139
|
+
else
|
140
|
+
pp node
|
141
|
+
raise ArgumentError, "unsupported (node) type >#{type}<"
|
142
|
+
end
|
143
|
+
end # each node
|
144
|
+
end # read
|
145
|
+
end # class Linter
|
146
|
+
|
147
|
+
|
148
|
+
end # class Parser
|
149
|
+
end # module SportDb
|
@@ -0,0 +1,97 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
module SportDb
|
4
|
+
|
5
|
+
class OutlineReader
|
6
|
+
|
7
|
+
def self.debug=(value) @@debug = value; end
|
8
|
+
def self.debug?() @@debug ||= false; end
|
9
|
+
def debug?() self.class.debug?; end
|
10
|
+
|
11
|
+
|
12
|
+
|
13
|
+
def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
|
14
|
+
txt = File.open( path, 'r:utf-8' ) {|f| f.read }
|
15
|
+
parse( txt )
|
16
|
+
end
|
17
|
+
|
18
|
+
def self.parse( txt )
|
19
|
+
new( txt ).parse
|
20
|
+
end
|
21
|
+
|
22
|
+
def initialize( txt )
|
23
|
+
@txt = txt
|
24
|
+
end
|
25
|
+
|
26
|
+
## note: skip "decorative" only heading e.g. ========
|
27
|
+
## todo/check: find a better name e.g. HEADING_EMPTY_RE or HEADING_LINE_RE or ???
|
28
|
+
HEADING_BLANK_RE = %r{\A
|
29
|
+
={1,}
|
30
|
+
\z}x
|
31
|
+
|
32
|
+
## note: like in wikimedia markup (and markdown) all optional trailing ==== too
|
33
|
+
HEADING_RE = %r{\A
|
34
|
+
(?<marker>={1,}) ## 1. leading ======
|
35
|
+
[ ]*
|
36
|
+
(?<text>[^=]+) ## 2. text (note: for now no "inline" = allowed)
|
37
|
+
[ ]*
|
38
|
+
=* ## 3. (optional) trailing ====
|
39
|
+
\z}x
|
40
|
+
|
41
|
+
def parse
|
42
|
+
outline=[] ## outline structure
|
43
|
+
start_para = true ## start new para(graph) on new text line?
|
44
|
+
|
45
|
+
@txt.each_line do |line|
|
46
|
+
line = line.strip ## todo/fix: keep leading and trailing spaces - why? why not?
|
47
|
+
|
48
|
+
if line.empty? ## todo/fix: keep blank line nodes?? and just remove comments and process headings?! - why? why not?
|
49
|
+
start_para = true
|
50
|
+
next
|
51
|
+
end
|
52
|
+
|
53
|
+
break if line == '__END__'
|
54
|
+
|
55
|
+
next if line.start_with?( '#' ) ## skip comments too
|
56
|
+
## strip inline (until end-of-line) comments too
|
57
|
+
## e.g Eupen | KAS Eupen ## [de]
|
58
|
+
## => Eupen | KAS Eupen
|
59
|
+
## e.g bq Bonaire, BOE # CONCACAF
|
60
|
+
## => bq Bonaire, BOE
|
61
|
+
line = line.sub( /#.*/, '' ).strip
|
62
|
+
pp line if debug?
|
63
|
+
|
64
|
+
## todo/check: also use heading blank as paragraph "breaker" or treat it like a comment ?? - why? why not?
|
65
|
+
next if HEADING_BLANK_RE.match( line ) # skip "decorative" only heading e.g. ========
|
66
|
+
|
67
|
+
## note: like in wikimedia markup (and markdown) all optional trailing ==== too
|
68
|
+
if m=HEADING_RE.match( line )
|
69
|
+
start_para = true
|
70
|
+
|
71
|
+
heading_marker = m[:marker]
|
72
|
+
heading_level = heading_marker.length ## count number of = for heading level
|
73
|
+
heading = m[:text].strip
|
74
|
+
|
75
|
+
puts "heading #{heading_level} >#{heading}<" if debug?
|
76
|
+
outline << [:"h#{heading_level}", heading]
|
77
|
+
else ## assume it's a (plain/regular) text line
|
78
|
+
if start_para
|
79
|
+
outline << [:p, [line]]
|
80
|
+
start_para = false
|
81
|
+
else
|
82
|
+
node = outline[-1] ## get last entry
|
83
|
+
if node[0] == :p ## assert it's a p(aragraph) node!!!
|
84
|
+
node[1] << line ## add line to p(aragraph)
|
85
|
+
else
|
86
|
+
puts "!! ERROR - invalid outline state / format - expected p(aragraph) node; got:"
|
87
|
+
pp node
|
88
|
+
exit 1
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
93
|
+
outline
|
94
|
+
end # method read
|
95
|
+
end # class OutlineReader
|
96
|
+
|
97
|
+
end # module SportDb
|
@@ -1,12 +1,12 @@
|
|
1
|
-
module SportDb
|
1
|
+
module SportDb
|
2
2
|
class Parser
|
3
|
-
|
4
|
-
|
3
|
+
|
4
|
+
|
5
5
|
## note - do NOT allow single alpha text for now
|
6
|
-
## add later?? A - B C - D - why?
|
6
|
+
## add later?? A - B C - D - why?
|
7
7
|
## opt 1) one alpha
|
8
|
-
## (?<text_i> [a-z]) # only allow single letter text (not numbers!!)
|
9
|
-
|
8
|
+
## (?<text_i> [a-z]) # only allow single letter text (not numbers!!)
|
9
|
+
|
10
10
|
## opt 2) more than one alphanum
|
11
11
|
|
12
12
|
|
@@ -26,19 +26,19 @@ class Parser
|
|
26
26
|
|
27
27
|
|
28
28
|
TEXT_RE = %r{
|
29
|
-
## must start with alpha (allow unicode letters!!)
|
30
|
-
(?<text>
|
31
|
-
## positive lookbehind
|
29
|
+
## must start with alpha (allow unicode letters!!)
|
30
|
+
(?<text>
|
31
|
+
## positive lookbehind
|
32
32
|
## (MUST be fixed number of chars - no quantifier e.g. +? etc.)
|
33
33
|
(?<=[ ,;@|\[\]]
|
34
34
|
|^
|
35
35
|
)
|
36
|
-
(?:
|
36
|
+
(?:
|
37
37
|
# opt 1 - start with alpha
|
38
38
|
\p{L}+ ## all unicode letters (e.g. [a-z])
|
39
39
|
|
|
40
40
|
|
41
|
-
# opt 2 - start with num!! - allow special case (e.g. 1. FC)
|
41
|
+
# opt 2 - start with num!! - allow special case (e.g. 1. FC)
|
42
42
|
\d+ # check for num lookahead (MUST be space or dot)
|
43
43
|
## MUST be followed by (optional dot) and
|
44
44
|
## required space !!!
|
@@ -46,69 +46,79 @@ TEXT_RE = %r{
|
|
46
46
|
\.? ## optional dot
|
47
47
|
[ ]? ## make space optional too - why? why not?
|
48
48
|
## yes - eg. 1st, 2nd, 5th etc.
|
49
|
-
\p{L}+
|
49
|
+
\p{L}+
|
50
50
|
)
|
51
|
-
|
51
|
+
|
52
52
|
(?:(?: (?:[ ]
|
53
53
|
(?!vs?\.?[ ]) ## note - exclude (v[ ]/vs[ ]/v.[ ]/vs.[ ])
|
54
|
-
)
|
54
|
+
)
|
55
55
|
| # only single spaces allowed inline!!!
|
56
|
-
[-]
|
56
|
+
[-]
|
57
57
|
)?
|
58
58
|
(?:
|
59
59
|
\p{L} |
|
60
|
-
[&/']
|
60
|
+
[&/']
|
61
61
|
|
|
62
62
|
(?:
|
63
|
-
\d+
|
64
|
-
(?![0-9.:h'/+-])
|
63
|
+
\d+
|
64
|
+
(?![0-9.:h'/+-])
|
65
65
|
## negative lookahead for numbers
|
66
66
|
## note - include digits itself!!!
|
67
|
-
)|
|
68
|
-
\.
|
69
|
-
)
|
67
|
+
)|
|
68
|
+
\.
|
69
|
+
)
|
70
70
|
)* ## must NOT end with space or dash(-)
|
71
71
|
## todo/fix - possible in regex here
|
72
72
|
## only end in alphanum a-z0-9 (not dot or & ???)
|
73
73
|
|
74
|
-
|
74
|
+
|
75
75
|
## allow optional at the end
|
76
76
|
## tag or year
|
77
|
-
## make it and in the future - why? why not?
|
78
|
-
##
|
77
|
+
## make it and in the future - why? why not?
|
78
|
+
##
|
79
|
+
## change - fix
|
80
|
+
## do NOT use (A) for amateur
|
81
|
+
## use A or A. with NO ()!!!
|
79
82
|
## (A) - allow with predefined alpha only for now
|
80
83
|
## e.g. (A) - amateur a team or b?
|
84
|
+
### same for U21 or U9 etc
|
85
|
+
## use with NO ()!!! - why? why not?
|
81
86
|
## or U21 U9 etc. - why? why not?
|
82
87
|
## or etc.
|
83
88
|
## (1879-1893) or allow years e.g. (1879-1893)
|
84
|
-
###
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
89
|
+
###
|
90
|
+
## add allow country code three to five letters for now
|
91
|
+
## change to generic 1 to 5 - why? why not?
|
92
|
+
## e.g. (A), (I),
|
93
|
+
## (AUT)
|
94
|
+
## (TRNC) five? for UEFA code for northern cyprus
|
95
|
+
## change to 1 to 4 - why? why not?
|
96
|
+
## check - fix possible for upper case only here
|
97
|
+
## inline for this group only?
|
93
98
|
(?:
|
94
|
-
[ ]
|
99
|
+
[ ]
|
95
100
|
\(
|
96
101
|
\d{4}-\d{4}
|
97
102
|
\)
|
98
|
-
)?
|
99
|
-
|
103
|
+
)?
|
104
|
+
(?:
|
105
|
+
[ ]+ ## allow more than once space - why? why not?
|
106
|
+
\( (?:
|
107
|
+
[A-Z]{1,5}
|
108
|
+
)
|
109
|
+
\)
|
110
|
+
)?
|
100
111
|
## add lookahead/lookbehind
|
101
|
-
## must be space!!!
|
112
|
+
## must be space!!!
|
102
113
|
## (or comma or start/end of string)
|
103
114
|
## kind of \b !!!
|
104
115
|
## positive lookahead
|
105
116
|
(?=[ ,;@|\[\]]
|
106
117
|
|$
|
107
118
|
)
|
108
|
-
)
|
119
|
+
)
|
109
120
|
}ix
|
110
121
|
|
111
122
|
|
112
123
|
end # class Parser
|
113
|
-
end # module SportDb
|
114
|
-
|
124
|
+
end # module SportDb
|
data/lib/sportdb/parser.rb
CHANGED
@@ -24,6 +24,11 @@ require_relative 'parser/lang'
|
|
24
24
|
require_relative 'parser/parser'
|
25
25
|
|
26
26
|
|
27
|
+
####
|
28
|
+
## todo/check - move outline reader upstream to cocos - why? why not?
|
29
|
+
## use read_outline(), parse_outline() - why? why not?
|
30
|
+
require_relative 'parser/outline_reader'
|
31
|
+
require_relative 'parser/linter'
|
27
32
|
|
28
33
|
###
|
29
34
|
# make parser api (easily) available - why? why not?
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sportdb-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-09-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: cocos
|
@@ -74,7 +74,8 @@ dependencies:
|
|
74
74
|
version: '4.1'
|
75
75
|
description: sportdb-parser - football.txt match parser (& tokenizer)
|
76
76
|
email: gerald.bauer@gmail.com
|
77
|
-
executables:
|
77
|
+
executables:
|
78
|
+
- fbtok
|
78
79
|
extensions: []
|
79
80
|
extra_rdoc_files:
|
80
81
|
- CHANGELOG.md
|
@@ -85,8 +86,11 @@ files:
|
|
85
86
|
- Manifest.txt
|
86
87
|
- README.md
|
87
88
|
- Rakefile
|
89
|
+
- bin/fbtok
|
88
90
|
- lib/sportdb/parser.rb
|
89
91
|
- lib/sportdb/parser/lang.rb
|
92
|
+
- lib/sportdb/parser/linter.rb
|
93
|
+
- lib/sportdb/parser/outline_reader.rb
|
90
94
|
- lib/sportdb/parser/parser.rb
|
91
95
|
- lib/sportdb/parser/token-date.rb
|
92
96
|
- lib/sportdb/parser/token-score.rb
|