rsssf-parser 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 1b6cfe7842f0f46d242c1c2fc1f52b4c032b5c25fce314939583c7f96a486c65
4
+ data.tar.gz: ba5244b284f65129dca3b35e87d10984e1bf8906e571b3e42e85a4615eecb733
5
+ SHA512:
6
+ metadata.gz: 9a63d121c858e35f757b59c490fc05cfd1457ac5c6e3294a291db787da141061f046c5ce2342fdf275e64bbb647934ef43d8547c2aa53eef537d12405d746185
7
+ data.tar.gz: e7a8f41d2d53e63fb72e35f22b1a0a0da370e15ab1b4aedf2ae1f37a2a1bc277ff31deb8362e541bbc403f7d49ea3f1120b4b271b9d36982359141ca80617d72
data/CHANGELOG.md ADDED
@@ -0,0 +1,3 @@
1
+ ### 0.0.1 / 2024-07-17
2
+
3
+ * Everything is new. First release.
data/Manifest.txt ADDED
@@ -0,0 +1,15 @@
1
+ CHANGELOG.md
2
+ Manifest.txt
3
+ README.md
4
+ Rakefile
5
+ bin/rsssf
6
+ lib/rsssf/parser.rb
7
+ lib/rsssf/parser/linter.rb
8
+ lib/rsssf/parser/parser.rb
9
+ lib/rsssf/parser/token-date.rb
10
+ lib/rsssf/parser/token-goals.rb
11
+ lib/rsssf/parser/token-note.rb
12
+ lib/rsssf/parser/token-round.rb
13
+ lib/rsssf/parser/token-score.rb
14
+ lib/rsssf/parser/token-text.rb
15
+ lib/rsssf/parser/token.rb
data/README.md ADDED
@@ -0,0 +1,11 @@
1
+ # rsssf-parser - football match schedule & results parser (& tokenizer) for the rsssf format / conventions
2
+
3
+
4
+
5
+
6
+
7
+
8
+
9
+
10
+
11
+
data/Rakefile ADDED
@@ -0,0 +1,29 @@
1
+ require 'hoe'
2
+
3
+
4
+ Hoe.spec 'rsssf-parser' do
5
+ ## gem specification for rsssf-parser (declared via Hoe.spec)
6
+ self.version = '0.0.1'
7
+
8
+ self.summary = "rsssf-parser - football match schedule & results parser (& tokenizer) for the rsssf format / conventions"
9
+ self.description = summary
10
+
11
+ self.urls = { home: 'https://github.com/sportdb/sport.db' }
12
+
13
+ self.author = 'Gerald Bauer'
14
+ self.email = 'gerald.bauer@gmail.com'
15
+
16
+ # switch extension to .markdown for github formatting
17
+ self.readme_file = 'README.md'
18
+ self.history_file = 'CHANGELOG.md'
19
+
20
+ self.licenses = ['Public Domain']
21
+ ## runtime dependencies - note: listed without a version constraint
22
+ self.extra_deps = [
23
+ ['sportdb-parser'], ### (re)use standard football.txt parser machinery - why? why not?
24
+ ]
25
+ ## extra gemspec settings - here the minimum required ruby version
26
+ self.spec_extras = {
27
+ required_ruby_version: '>= 2.2.2'
28
+ }
29
+ end
data/bin/rsssf ADDED
@@ -0,0 +1,80 @@
1
#!/usr/bin/env ruby

## rsssf - command line tool; runs the linter over one or more datafiles
##           and reports all parse errors found
##
## tip: to test run:
##   ruby -I ./lib -I ../parser/lib bin/rsssf

require 'rsssf/parser'

require 'optparse'


args = ARGV
opts = { debug: false,
         metal: false }

## note - the block param is named o (and NOT parser)
##          to avoid shadowing the outer parser local
parser = OptionParser.new do |o|
  o.banner = "Usage: #{$PROGRAM_NAME} [options]"

  ##
  ## check if git has a offline option?? (use same)
  ##   check for other tools - why? why not?

  o.on( "--verbose", "--debug",
        "turn on verbose / debug output (default: #{opts[:debug]} )" ) do |debug|
    opts[:debug] = debug
  end

  o.on( "--metal",
        "turn off typed parse tree; show to the metal tokens"+
        " (default: #{opts[:metal]})" ) do |metal|
    opts[:metal] = metal
  end
end
## note - parse! removes all (known) options from args in-place;
##          what is left are the datafile paths
parser.parse!( args )

puts "OPTS:"
p opts
puts "ARGV:"
p args


## no paths passed in? fall back to a built-in sample datafile
paths = if args.empty?
          [
            '../../../rsssf/austria/2010-11/cup.txt',
          ]
        else
          ## todo/fix - expand_args!!!
          args
        end

pp paths


Rsssf::Parser::Linter.debug = true    if opts[:debug]

linter = Rsssf::Parser::Linter.new

## note - --metal turns parsing off, that is, tokenize only
paths.each_with_index do |path,i|
  puts "==> [#{i+1}/#{paths.size}] reading >#{path}<..."
  linter.read( path, parse: !opts[:metal] )
end

if linter.errors?
  puts
  pp linter.errors
  puts "!! #{linter.errors.size} parse error(s) in #{paths.size} datafile(s)"
else
  puts "OK no parse errors found in #{paths.size} datafile(s)"
end

puts "bye"
77
+
78
+
79
+
80
+
@@ -0,0 +1,84 @@
1
+
2
module Rsssf
class Parser

###
## note - Linter for now nested inside Parser - keep? why? why not?
##
## Linter - reads datafiles line-by-line, runs every line
##   (except blank and comment lines) through the parser
##   and collects all error messages in errors
class Linter

  ## debug flag shared by all linters
  def self.debug=(value) @@debug = value; end
  def self.debug?() @@debug ||= false; end    ## note: default is FALSE
  def debug?()  self.class.debug?; end


  ## list of [path, message, line text] triplets (one per error message)
  attr_reader :errors

  def initialize
    @errors = []
    @parser = Parser.new   ## use own parser instance (not shared) - why? why not?
  end

  ## true if at least one error was collected
  def errors?() !@errors.empty?; end


  #########
  ## read the datafile at path and tokenize (or parse) every line
  ##
  ##   parse -  false (default) - tokenize (only)
  ##         -  true            - tokenize & parse
  ##
  ##  all error messages get added to errors
  ##  returns an array with the tokens (or nodes) of every processed line
  def read( path, parse: false )

    ## fix - (re)use outline reader later!!!
    ##   plus check for headings etc.

    text  = File.read( path, encoding: 'utf-8' )
    lines = text.split( "\n" )

    tree = []
    lines.each do |line|
      ## skip blank and comment lines
      stripped = line.strip
      next if stripped.empty? || stripped.start_with?( '#' )

      ## strip inline (end-of-line) comments
      ##   note - use .* (and NOT .+) to remove a bare trailing # too
      line = line.sub( /#.*$/, '' )

      if debug?
        puts
        puts "line >#{line}<"
      end

      t, error_messages = if parse
                            @parser.parse_with_errors( line )
                          else
                            @parser.tokenize_with_errors( line )
                          end

      ## add to "global" error list
      ##   make a triplet tuple (file / msg / line text)
      error_messages.each do |msg|
        @errors << [ path,
                     msg,
                     line
                   ]
      end

      pp t   if debug?

      tree << t
    end

    tree
  end  # read
end  # class Linter


end   # class Parser
end   # module Rsssf
@@ -0,0 +1,100 @@
1
module Rsssf
class Parser


  ## transforms
  ##
  ##  Netherlands  1-2 (1-1)  England
  ##   =>  text => team
  ##       score|vs
  ##       text => team


  ##
  ## todo - add !!!!
  ##    collect_until  e.g. collect_until( :text )


  ##
  ## tokenize the line and transform the tokens into (parse tree/ast) nodes
  ##
  ##   - text, score (or vs), text  =>  team, score (or vs), team
  ##   - text, minute               =>  player, minute
  ##   - all other tokens get passed through as-is (1:1)
  ##
  ##  returns a pair, that is, [nodes, errors]
  ##    where errors holds the error messages reported by the tokenizer
  ##
  ##  note - the debug keyword is accepted but not used (for now)
  def parse_with_errors( line, debug: false )
    errors = []
    tokens, token_errors = tokenize_with_errors( line )
    errors += token_errors


    ## note - pass 1 is turned off (commented out) for now
    ##
    ## #############
    ## ## pass 1
    ## ##   replace all texts with keyword matches (e.g. group, round, leg, etc.)
    ## tokens = tokens.map do |t|
    ##    if t[0] == :text
    ##       text = t[1]
    ##       if is_group?( text )
    ##          ### expects to be followed by num (or text ABC??)
    ##          [:group, text]
    ##       elsif is_matchday?( text )
    ##          ### expects to be followed by num
    ##          ##   use different name e.g. :fix_round or such?
    ##          [:matchday, text]
    ##       elsif is_leg?( text )
    ##          [:leg, text]
    ##       elsif is_round?( text )
    ##          [:round, text]
    ##       else
    ##          t  ## pass through as-is (1:1)
    ##       end
    ##    else
    ##       t
    ##    end
    ## end
    ##
    ## ## puts "tokens:"
    ## ## pp tokens


    ## transform tokens into (parse tree/ast) nodes
    nodes = []

    ## note - (re)use token buffer from "standard" parser here !!!!
    buf = SportDb::Parser::Tokens.new( tokens )
    ## pp buf


    ## note - check for end-of-stream BEFORE pulling the next token;
    ##          otherwise an empty token list results in a bogus nil node
    until buf.eos?
      if buf.match?( :text, [:score,
                             :score_awd,
                             :score_abd,
                             :score_ppd,
                             :score_np,
                             :score_wo,
                             :vs], :text )
        nodes << [:team, buf.next[1]]
        nodes << buf.next
        nodes << [:team, buf.next[1]]
      elsif buf.match?( :text, :minute )   ## assume player+minute
        nodes << [:player, buf.next[1]]
        nodes << buf.next
      else
        ## pass through
        nodes << buf.next
      end
    end

    [nodes,errors]
  end


  ### convenience helper - ignore errors by default
  ##    returns the nodes only
  def parse( line, debug: false )
    nodes, _ = parse_with_errors( line, debug: debug )
    nodes
  end


end   # class Parser
end   # module Rsssf
100
+
@@ -0,0 +1,161 @@
1
module Rsssf
class Parser


  ##
  ## parse a names table - one entry per line
  ##   with one or more spellings separated by spaces or tabs
  ##
  ##  blank lines, comment lines (starting with #)
  ##    and inline (until end-of-line) comments get skipped
  ##
  ##  returns an array of lines (with words) e.g.
  ##    [["March", "Mar"], ["May"]]
  def self.parse_names( txt )
    lines = []    # array of lines (with words)

    txt.each_line do |line|
      line = line.strip

      next if line.empty?
      next if line.start_with?( '#' )   ## skip comments too

      ## strip inline (until end-of-line) comments too
      ##   e.g. Janvier  Janv  Jan  ## check janv in use??
      ##     => Janvier  Janv  Jan
      line = line.sub( /#.*/, '' ).strip

      ## todo/fix -- add check for duplicates
      lines << line.split( /[ \t]+/ )
    end

    lines
  end  # method parse_names


  ##
  ## join all words together into a single string
  ##   (for use as a regex alternation) e.g.
  ##   January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|...
  ##
  ##  note - every word gets regex-escaped; the string gets interpolated
  ##           into extended-mode regexes where an unescaped dot, space or #
  ##           would change the pattern (plain letters stay as-is)
  def self.build_names( lines )
    lines.flat_map { |names| names.map { |name| Regexp.escape( name ) } }
         .join( '|' )
  end


  ## add normalize option (for downcase) - why? why not?
  ##
  ## build a lookup map that maps the word to the index (line no) plus 1 e.g.
  ##    {"january"  => 1,  "jan" => 1,
  ##     "february" => 2,  "feb" => 2,
  ##     "march"    => 3,  "mar" => 3,
  ##     "april"    => 4,  "apr" => 4,
  ##     "may"      => 5,
  ##     "june"     => 6,  "jun" => 6, ...
  ##
  ##  note: downcase name!!!
  def self.build_map( lines )
    lines.each_with_index.each_with_object( {} ) do |(names,i),h|
      ## note: start mapping with 1 (and NOT zero-based, that is, 0)
      names.each { |name| h[ name.downcase ] = i+1 }
    end
  end


  ## note - support only 5 letter max for now
  ##    no January|February|August etc.
  MONTH_LINES = parse_names( <<-TXT )
    Jan
    Feb
    March   Mar
    April   Apr
    May
    June    Jun
    July    Jul
    Aug
    Sept    Sep
    Oct
    Nov
    Dec
  TXT

  MONTH_NAMES = build_names( MONTH_LINES )
  # pp MONTH_NAMES
  MONTH_MAP   = build_map( MONTH_LINES )
  # pp MONTH_MAP


  ### note - only support two or three letters
  ##    no Tues | Thur | Thurs | Sunday etc.
  DAY_LINES = parse_names( <<-TXT )
    Mon   Mo
    Tue   Tu
    Wed   We
    Thu   Th
    Fri   Fr
    Sat   Sa
    Sun   Su
  TXT

  DAY_NAMES = build_names( DAY_LINES )
  # pp DAY_NAMES
  DAY_MAP   = build_map( DAY_LINES )
  # pp DAY_MAP


  #=>
  # "Jan|Feb|March|Mar|April|Apr|May|June|Jun|
  #  July|Jul|Aug|Sept|Sep|Oct|Nov|Dec"
  #
  # "Mon|Mo|Tue|Tu|Wed|We|
  #  Thu|Th|Fri|Fr|Sat|Sa|Sun|Su"


  ## todo - add more date variants !!!!

  ## DATE_RE - matches a date (case-insensitive)
  ##   with optional enclosing square brackets e.g.
  ##     Fri Aug 9   or  [Jun 27 2024]  or  17-3-22
  ##
  ##   the date (without brackets) gets captured as date;
  ##   variant one captures day_name, month_name, day, year too
  DATE_RE = %r{
      ## note - do not include [] in capture for now - why? why not
      ## eat-up/consume optional [] - part i
      (?: \[ | \b
      )
      (?<date>

        (?:  ######
             ## variant I/1/one
             ###  Fri June 24

             ## optional day name
             ((?<day_name>#{DAY_NAMES})
                 [ ]
             )?
             ## allow 1 or 2 spaces e.g. Jul  2 / Jun 27 to pretty print
             (?<month_name>#{MONTH_NAMES})
                 [ ]{1,2}
             (?<day>\d{1,2})
             ## optional year
             (  [ ]
                (?<year>\d{4})
             )?
        )
        |
        (?:  ####
             ## variant II/2/two
             ##    17- 3-22  - allow space before month
             ##    17-3-22
             \d{1,2}
                -
             [ ]*\d{1,2}
                -
             (?:
                \d{4} |    ## 2024
                \d{2}      ## or 24 only
             )
        )
      )  ## end date capture
      ## eat-up/consume optional [] - part ii
      (?: \] | \b
      )
  }ix


end   # class Parser
end   # module Rsssf
161
+
@@ -0,0 +1,68 @@
1
+ module Rsssf
2
+ class Parser
3
+
4
+
5
+ ## cannot start with number
6
+ ## cannot have number inside
7
+ ## cannot end with number!!!
8
+ ##
9
+ ## check if can end in dot - why? why not?
10
+ ## e.g. jr. or such?
11
+ ## NOTE(review): the rules above read like (name) text rules - confirm they belong here
12
+ ## MINUTE_RE - match a minute (one to three digits) with optional quote (')
13
+ ## allow 45+/90+ too (plus NOT followed by a digit)
14
+ ## or 45+10/90+5 (plus followed by offset minutes) or 90+pen or
15
+ ## 90+ pen/90+p/90+ og - note: the goal type itself is NOT part of the match
16
+ ## without plus the minute must be followed by an optional og|pen|p and
17
+ ## a space, semicolon, comma, closing bracket (]) or the end-of-line
18
+ MINUTE_RE = %r{
19
+ (?<minute>
20
+ \b
21
+ \d{1,3}
22
+ '? ## optional minute quote (')
23
+ (?:
24
+ # optional offset/extra e.g. 45+ / 90+ or 45+10 / 90+5
25
+ (?: \+
26
+ (?:
27
+ (?! [0-9]) ## negative look ahead (not a number) required
28
+ |
29
+ (?:
30
+ \d{1,3}
31
+ '? ## optional minute quote (')
32
+ (?= (og|pen|p)? ([ ;,\]]|$))
33
+ )
34
+ )
35
+ )
36
+ |
37
+ (?= (og|pen|p)? ([ ;,\]]|$)) # note - break can be og|pen|p too
38
+ )
39
+ )}ix
40
+ ### note - word boundary (\b) will NOT work for quote (')
41
+ ## because quote is NOT alphanum (like dot etc.)
42
+
43
+
44
+
45
+ ## goal types - penalty marker (pen|p); captured as pen
46
+ GOAL_PEN_RE = %r{
47
+ (?<pen>
48
+ (?<=\d|\+|[ ]|') ## must follow a number or plus (e.g. 45p / 45+p / 45 p / 45'p) or space
49
+ (?: pen|p )
50
+ \b
51
+ )
52
+ }ix
53
+
54
+ ## goal types - own goal marker (og); captured as og
55
+ GOAL_OG_RE = %r{
56
+ (?<og>
57
+ (?<=\d|\+|[ ]|') ## must follow a number or plus (e.g. 45og / 45+og / 45 og) or space
58
+ og
59
+ \b
60
+ )
61
+ }ix
62
+
63
+
64
+
65
+
66
+ end # class Parser
67
+ end # module Rsssf
68
+
@@ -0,0 +1,113 @@
1
+ module Rsssf
2
+ class Parser
3
+
4
+
5
+ ###
6
+ ## move to token-note(s) file !!!! - NOTE(review): possibly done already; confirm
7
+ ##
8
+ ## NOTE_RE - match a note in square brackets (case-insensitive); the text without the brackets is captured as note
9
+ NOTE_RE = %r{
10
+ \[
11
+ (?<note>
12
+ (?: ## starting with ___ PLUS requiring more text
13
+ (?:
14
+ nb:
15
+ ## e.g. [NB: between top-8 of regular season]
16
+ # [NB: América, Morelia and Tigres qualified on better record regular season]
17
+ # [NB: Celaya qualified on away goals]
18
+ # [NB: Alebrijes qualified on away goal]
19
+ # [NB: Leones Negros qualified on away goals]
20
+ #
21
+ # todo/fix:
22
+ # add "top-level" NB: version
23
+ ## with full (end-of) line note - why? why not?
24
+ |
25
+ (?: originally[ ])? scheduled
26
+ ## e.g. [originally scheduled to play in Mexico City]
27
+ |
28
+ rescheduled
29
+ ## e.g. [Rescheduled due to earthquake occurred in Mexico on September 19]
30
+ |
31
+ remaining
32
+ ## e.g. [remaining 79']
33
+ ## [remaining 84']
34
+ ## [remaining 59']
35
+ ## [remaining 5']
36
+ |
37
+ played
38
+ ## e.g. [played in Macaé-RJ]
39
+ ## [played in Caxias do Sul-RS]
40
+ ## [played in Sete Lagoas-MG]
41
+ ## [played in Uberlândia-MG]
42
+ ## [played in Brasília-DF]
43
+ ## [played in Vöcklabruck]
44
+ ## [played in Pasching]
45
+ |
46
+ declared
47
+ ## e.g. [declared void]
48
+ |
49
+ inter-group
50
+ ## e.g. [inter-group A-B]
51
+ ## [inter-group C-D]
52
+ )
53
+ [ ]
54
+ [^\]]+? ## slurp all to next ] - (use non-greedy)
55
+ )
56
+ |
57
+ (?:
58
+ ## starting with in - do NOT allow digits
59
+ ## name starting with in possible - why? why not?
60
+ in[ ]
61
+ [^0-9\]]+?
62
+ ## e.g. [In Estadio La Corregidora]
63
+ ## [in Unidad Deportiva Centenario]
64
+ ## [in Estadio Olímpico Universitario]
65
+ ## [in Estadio Victoria]
66
+ ## [in UD José Brindis]
67
+ ## [in Colomos Alfredo "Pistache" Torres stadium]
68
+ )
69
+ |
70
+ (?:
71
+ (?:
72
+ postponed
73
+ ## e.g. [postponed due to problems with the screen of the stadium]
74
+ ## [postponed by storm]
75
+ ## [postponed due to tropical storm "Hanna"]
76
+ ## [postponed from Sep 10-12 due to death Queen Elizabeth II]
77
+ ## [postponed] -- include why? why not?
78
+ |
79
+ awarded
80
+ ## e.g. [awarded match to Leones Negros by undue alignment; original result 1-2]
81
+ ## [awarded 3-0 to Cafetaleros by undue alignment; originally ended 2-0]
82
+ ## [awarded 3-0; originally 0-2, América used ineligible player (Federico Viñas)]
83
+ |
84
+ abandoned
85
+ ## e.g. [abandoned at 1-1 in 65' due to cardiac arrest Luton player Tom Lockyer]
86
+ ## [abandoned at 0-0 in 6' due to waterlogged pitch]
87
+ ## [abandoned at 5-0 in 80' due to attack on assistant referee by Cerro; result stood]
88
+ ## [abandoned at 1-0 in 31']
89
+ ## [abandoned at 0-1' in 85 due to crowd trouble]
90
+ |
91
+ suspended
92
+ ## e.g. [suspended at 0-0 in 12' due to storm]
93
+ ## [suspended at 84' by storm; result stood]
94
+ |
95
+ annulled
96
+ ## e.g. [annulled]
97
+ |
98
+ replay
99
+ ## e.g. [replay]
100
+ )
101
+ ([ ] ## note - optional text
102
+ [^\]]+?
103
+ )? ## slurp all to next ] - (use non-greedy)
104
+ )
105
+ ) # note capture
106
+ \]
107
+ }ix
108
+
109
+
110
+
111
+ end # class Parser
112
+ end # module Rsssf
113
+