rsssf-parser 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 1b6cfe7842f0f46d242c1c2fc1f52b4c032b5c25fce314939583c7f96a486c65
4
+ data.tar.gz: ba5244b284f65129dca3b35e87d10984e1bf8906e571b3e42e85a4615eecb733
5
+ SHA512:
6
+ metadata.gz: 9a63d121c858e35f757b59c490fc05cfd1457ac5c6e3294a291db787da141061f046c5ce2342fdf275e64bbb647934ef43d8547c2aa53eef537d12405d746185
7
+ data.tar.gz: e7a8f41d2d53e63fb72e35f22b1a0a0da370e15ab1b4aedf2ae1f37a2a1bc277ff31deb8362e541bbc403f7d49ea3f1120b4b271b9d36982359141ca80617d72
data/CHANGELOG.md ADDED
@@ -0,0 +1,3 @@
1
+ ### 0.0.1 / 2024-07-17
2
+
3
+ * Everything is new. First release.
data/Manifest.txt ADDED
@@ -0,0 +1,15 @@
1
+ CHANGELOG.md
2
+ Manifest.txt
3
+ README.md
4
+ Rakefile
5
+ bin/rsssf
6
+ lib/rsssf/parser.rb
7
+ lib/rsssf/parser/linter.rb
8
+ lib/rsssf/parser/parser.rb
9
+ lib/rsssf/parser/token-date.rb
10
+ lib/rsssf/parser/token-goals.rb
11
+ lib/rsssf/parser/token-note.rb
12
+ lib/rsssf/parser/token-round.rb
13
+ lib/rsssf/parser/token-score.rb
14
+ lib/rsssf/parser/token-text.rb
15
+ lib/rsssf/parser/token.rb
data/README.md ADDED
@@ -0,0 +1,11 @@
1
+ # rsssf-parser - football match schedule & results parser (& tokenizer) for the rsssf format / conventions
2
+
3
+
4
+
5
+
6
+
7
+
8
+
9
+
10
+
11
+
data/Rakefile ADDED
@@ -0,0 +1,29 @@
1
require 'hoe'


## gem specification for rsssf-parser (declared via the Hoe.spec block)
Hoe.spec 'rsssf-parser' do

  self.version = '0.0.1'

  self.summary = "rsssf-parser - football match schedule & results parser (& tokenizer) for the rsssf format / conventions"
  self.description = summary

  self.urls = { home: 'https://github.com/sportdb/sport.db' }

  self.author = 'Gerald Bauer'
  self.email = 'gerald.bauer@gmail.com'

  # switch extension to .markdown for github formatting
  self.readme_file = 'README.md'
  self.history_file = 'CHANGELOG.md'

  self.licenses = ['Public Domain']

  self.extra_deps = [
    ['sportdb-parser'], ### (re)use standard football.txt parser machinery - why? why not?
  ]

  self.spec_extras = {
    required_ruby_version: '>= 2.2.2'
  }
end
data/bin/rsssf ADDED
@@ -0,0 +1,80 @@
1
#!/usr/bin/env ruby

## rsssf - command line tool; lints (tokenizes or parses) datafiles
##           and reports all collected parse errors
##
## tip: to test run:
##     ruby -I ./lib -I ../parser/lib bin/rsssf

require 'rsssf/parser'


require 'optparse'


args = ARGV
opts = { debug: false,
         metal: false }

## note: the block argument is named o (and NOT parser)
##         to avoid shadowing the outer parser local
parser = OptionParser.new do |o|
  o.banner = "Usage: #{$PROGRAM_NAME} [options]"

  ##
  ## check if git has a offline option?? (use same)
  ## check for other tools - why? why not?


  o.on( "--verbose", "--debug",
        "turn on verbose / debug output (default: #{opts[:debug]} )" ) do |debug|
    opts[:debug] = debug
  end

  o.on( "--metal",
        "turn off typed parse tree; show to the metal tokens"+
        " (default: #{opts[:metal]})" ) do |metal|
    opts[:metal] = metal
  end
end
parser.parse!( args )   ## note: removes all parsed options from args (in-place)

puts "OPTS:"
p opts
puts "ARGV:"
p args


## use a built-in sample datafile if no path(s) passed along
paths = if args.empty?
          [
            '../../../rsssf/austria/2010-11/cup.txt',
          ]
        else
          ## todo/fix - expand_args!!!
          args
        end


pp paths




Rsssf::Parser::Linter.debug = true   if opts[:debug]

linter = Rsssf::Parser::Linter.new


## note: --metal turns off parsing, that is, tokenize only
paths.each_with_index do |path,i|
  puts "==> [#{i+1}/#{paths.size}] reading >#{path}<..."
  linter.read( path, parse: !opts[:metal] )
end

if linter.errors?
  puts
  pp linter.errors
  puts "!! #{linter.errors.size} parse error(s) in #{paths.size} datafile(s)"
else
  puts "OK no parse errors found in #{paths.size} datafile(s)"
end

puts "bye"
77
+
78
+
79
+
80
+
@@ -0,0 +1,84 @@
1
+
2
module Rsssf
class Parser

###
## note - Linter for now nested inside Parser - keep? why? why not?
##
##  reads datafiles line-by-line, passes every (non-blank, non-comment) line
##   on to the parser (or tokenizer) and collects all reported errors
class Linter

  ## debug flag - shared by all linters (class-level)
  def self.debug=(value) @@debug = value; end
  def self.debug?() @@debug ||= false; end  ## note: default is FALSE
  def debug?() self.class.debug?; end



  ## list of collected errors; every entry is a triplet
  ##   [path, msg, line]  (file / msg / line text)
  attr_reader :errors

  def initialize
    @errors = []
    @parser = Parser.new   ## use own parser instance (not shared) - why? why not?
  end


  ## true if at least one error got collected (by read)
  def errors?() @errors.size > 0; end



  #########
  ## read (and lint) the datafile in path;
  ##   all errors get added to errors (accumulating over many reads)
  ##
  ## parse - false (default) - tokenize (only)
  ##       - true            - tokenize & parse
  def read( path, parse: false )

    ## fix - (re)use outline reader later!!!
    ##   plus check for headings etc.

    text = File.open( path, 'r:utf-8' ) { |f| f.read }
    lines = text.split( "\n" )


    ## process lines
    tree = []
    lines.each do |line|

      ## skip blank and comment lines
      next if line.strip.empty? || line.strip.start_with?('#')

      ## strip inline (end-of-line) comments
      ##   note - use .* (and NOT .+) to strip a "bare" trailing # too;
      ##           same as the inline comment handling in parse_names
      line = line.sub( /#.*$/, '' )


      if debug?
        puts
        puts "line >#{line}<"
      end

      t, error_messages = if parse
                            @parser.parse_with_errors( line )
                          else
                            @parser.tokenize_with_errors( line )
                          end


      if error_messages.size > 0
        ## add to "global" error list
        ##   make a triplet tuple (file / msg / line text)
        error_messages.each do |msg|
          @errors << [ path,
                       msg,
                       line
                     ]
        end
      end

      pp t   if debug?

      tree << t
    end
    ## pp tree
  end  # read
end  # class Linter


end # class Parser
end # module Rsssf
@@ -0,0 +1,100 @@
1
module Rsssf
class Parser


  ## transforms
  ##
  ##  Netherlands 1-2 (1-1) England
  ##   =>  text     => team
  ##       score|vs
  ##       text     => team



  ##
  ##
  ##  add !!!!
  ##    collect_until  e.g. collect_until( :text )


  ###
  ## tokenize the line and transform the tokens into (parse tree) nodes
  ##
  ##   text, score|vs, text  =>  [:team, text], score|vs token, [:team, text]
  ##   text, minute          =>  [:player, text], minute token
  ##   all other tokens get passed through as-is (1:1)
  ##
  ## returns a pair of [nodes, errors]
  ##   where errors holds the errors reported by tokenize_with_errors
  ##
  ## note - the debug keyword argument is (currently) not used
  def parse_with_errors( line, debug: false )
    errors = []
    tokens, token_errors = tokenize_with_errors( line )
    errors += token_errors


=begin
    #############
    ## pass 1
    ##   replace all texts with keyword matches (e.g. group, round, leg, etc.)
    tokens = tokens.map do |t|
                if t[0] == :text
                   text = t[1]
                   if is_group?( text )
                     ### expects to be followed by num (or text ABC??)
                     [:group, text]
                   elsif is_matchday?( text )
                     ### expects to be followed by num
                     ## use different name e.g. :fix_round or such?
                     [:matchday, text]
                   elsif is_leg?( text )
                     [:leg, text]
                   elsif is_round?( text )
                     [:round, text]
                   else
                     t ## pass through as-is (1:1)
                   end
                else
                   t
                end
             end


    ## puts "tokens:"
    ## pp tokens
=end

    ## transform tokens into (parse tree/ast) nodes
    nodes = []

    ## note - (re)use token buffer from "standard" parser here !!!!
    buf = SportDb::Parser::Tokens.new( tokens )
    ## pp buf


    ## note - check for end-of-stream BEFORE consuming a token;
    ##          otherwise an empty token list results in a bogus node
    until buf.eos?
      if buf.match?( :text, [:score,
                             :score_awd,
                             :score_abd,
                             :score_ppd,
                             :score_np,
                             :score_wo,
                             :vs], :text )
        nodes << [:team, buf.next[1]]
        nodes << buf.next
        nodes << [:team, buf.next[1]]
      elsif buf.match?( :text, :minute )   ## assume player+minute
        nodes << [:player, buf.next[1]]
        nodes << buf.next
      else
        ## pass through
        nodes << buf.next
      end
    end

    [nodes,errors]
  end


  ### convenience helper - ignore errors by default
  ##    returns the nodes only
  def parse( line, debug: false )
    nodes, _ = parse_with_errors( line, debug: debug )
    nodes
  end


end # class Parser
end # module Rsssf
100
+
@@ -0,0 +1,161 @@
1
module Rsssf
class Parser



  ###
  ## parse a text block of names into an array of lines (with words)
  ##   - one line per name (e.g. month or day),
  ##       all words on a line are alternate spellings
  ##   - blank lines, comment lines (#) and
  ##       inline (end-of-line) comments get skipped / stripped
  ##
  ##  e.g.  "Jan\nMarch Mar # note\n"  =>  [["Jan"], ["March", "Mar"]]
  def self.parse_names( txt )
    lines = []   # array of lines (with words)

    txt.each_line do |line|
      line = line.strip

      next if line.empty?
      next if line.start_with?( '#' )   ## skip comments too

      ## strip inline (until end-of-line) comments too
      ##   e.g. Janvier  Janv  Jan  ## check janv in use??
      ##   =>   Janvier  Janv  Jan

      line = line.sub( /#.*/, '' ).strip
      ## pp line

      values = line.split( /[ \t]+/ )
      ## pp values

      ## todo/fix -- add check for duplicates
      lines << values
    end
    lines

  end # method parse_names


  ###
  ## build a regex alternation (as a string) from the lines (of words)
  def self.build_names( lines )
    ## join all words together into a single string e.g.
    ##   January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|...
    lines.map { |line| line.join('|') }.join('|')
  end



  ## add normalize option (for downcase) - why? why not?
  def self.build_map( lines )
    ## note: downcase name!!!
    ## build a lookup map that maps the word to the index (line no) plus 1 e.g.
    ##   {"january" => 1, "jan" => 1,
    ##    "february" => 2, "feb" => 2,
    ##    "march" => 3, "mar" => 3,
    ##    "april" => 4, "apr" => 4,
    ##    "may" => 5,
    ##    "june" => 6, "jun" => 6, ...
    lines.each_with_index.each_with_object( {} ) do |(line,i),h|
      line.each { |name| h[ name.downcase ] = i+1 }  ## note: start mapping with 1 (and NOT zero-based, that is, 0)
    end
  end


  ## note - support only 5 letter max for now
  ##   no January|February|August etc.
  ##
  ## note - all lookup constants are frozen (read-only)
  MONTH_LINES = parse_names( <<TXT ).freeze
Jan
Feb
March Mar
April Apr
May
June Jun
July Jul
Aug
Sept Sep
Oct
Nov
Dec
TXT

  MONTH_NAMES = build_names( MONTH_LINES ).freeze
  # pp MONTH_NAMES
  MONTH_MAP = build_map( MONTH_LINES ).freeze
  # pp MONTH_MAP


  ### note - only support two or three letters
  ##   no Tues | Thur | Thurs | Sunday etc.
  DAY_LINES = parse_names( <<TXT ).freeze
Mon Mo
Tue Tu
Wed We
Thu Th
Fri Fr
Sat Sa
Sun Su
TXT


  DAY_NAMES = build_names( DAY_LINES ).freeze
  # pp DAY_NAMES
  DAY_MAP = build_map( DAY_LINES ).freeze
  # pp DAY_MAP



  #=>
  # "Jan|Feb|March|Mar|April|Apr|May|June|Jun|
  #  July|Jul|Aug|Sept|Sep|Oct|Nov|Dec"
  #
  # "Mon|Mo|Tue|Tu|Wed|We|
  #  Thu|Th|Fri|Fr|Sat|Sa|Sun|Su"



  ## todo - add more date variants !!!!

  # e.g. Fri Aug 9
  #
  #  note - only variant I/1/one captures the date parts
  #           (day_name, month_name, day, year);
  #         variant II/2/two captures the complete date only
  DATE_RE = %r{
     ## note - do not include [] in capture for now - why? why not
     ## eat-up/consume optional [] - part i
     (?: \[ | \b
     )
     (?<date>

       (?: ######
           ## variant I/1/one
           ### Fri June 24

           ## optional day name
           ((?<day_name>#{DAY_NAMES})
               [ ]
           )?
           ## allow 1 or 2 spaces e.g. Jul 2 / Jun 27 to pretty print
           (?<month_name>#{MONTH_NAMES})
               [ ]{1,2}
           (?<day>\d{1,2})
           ## optional year
           ( [ ]
             (?<year>\d{4})
           )?
       )
       |
       (?: ####
           ## variant II/2/two
           ## 17- 3-22 - allow space befor mont
           ## 17-3-22
           \d{1,2}
            -
           [ ]*\d{1,2}
            -
           (?:
               \d{4} | ## 2024
               \d{2} ## or 24 only
           )
       )
     ) ## end date capture
     ## eat-up/consume optional [] - part ii
     (?: \] | \b
     )
  }ix



end # class Parser
end # module Rsssf
161
+
@@ -0,0 +1,68 @@
1
module Rsssf
class Parser


  ## cannot start with number
  ## cannot have number inside
  ## cannot end with number!!!
  ##
  ## check if can end in dot - why? why not?
  ## e.g. jr. or such?

  ##
  ## allow 45+/90+ too
  ## or 90+pen or
  ## 90+ pen/90+p/90+ og


  ###
  ## (goal) minute - one to three digits with optional quote (')
  ##   followed by
  ##     - a plus (+) with optional offset/extra minutes e.g. 45+ / 90+5, or
  ##     - a "break" (checked via look ahead only, NOT consumed), that is,
  ##         an optional og|pen|p followed by
  ##         space, semicolon, comma, closing bracket (]) or end-of-line
  ##
  ##  the match is available via the named capture minute
  MINUTE_RE = %r{
     (?<minute>
       \b
       \d{1,3}
       '? ## optional minute quote (')
       (?:
         # optional offset/extra e.g. 45+ / 90+ or 45+10 / 90+5
         (?: \+
           (?:
             (?! [0-9]) ## negative look ahead (not a number) required
             |
             (?:
               \d{1,3}
               '? ## optional minute quote (')
               (?= (og|pen|p)? ([ ;,\]]|$))
             )
           )
         )
         |
         (?= (og|pen|p)? ([ ;,\]]|$)) # note - break can be og|pen|p too
       )
     )}ix
  ### note - word boundary (\b) will NOT work for quote (')
  ##   because quote is NOT alphanum (like dot etc.)



  ## goal types

  ## penalty (goal) marker - pen or p (case-insensitive);
  ##   the match is available via the named capture pen
  GOAL_PEN_RE = %r{
     (?<pen>
       (?<=\d|\+|[ ]|') ## must follow a number or plus (e.g. 45p / 45+p / 45 p / 45'p) or space
       (?: pen|p )
       \b
     )
  }ix


  ## own goal marker - og (case-insensitive);
  ##   the match is available via the named capture og
  GOAL_OG_RE = %r{
     (?<og>
       (?<=\d|\+|[ ]|') ## must follow a number or plus (e.g. 45og / 45+og / 45 og) or space
       og
       \b
     )
  }ix




end # class Parser
end # module Rsssf
68
+
@@ -0,0 +1,113 @@
1
module Rsssf
class Parser


  ###
  ## move to token-note(s) file !!!!
  ##

  ###
  ## note in square brackets e.g. [played in Pasching] or [replay];
  ##   the note text (without the brackets) is available
  ##     via the named capture note
  ##
  ##  a note must start with one of the listed keywords (case-insensitive);
  ##   three groups of keywords get matched:
  ##     1) keyword PLUS required text (nb:, scheduled, played, etc.)
  ##     2) in PLUS required text without digits and
  ##     3) keyword PLUS optional text (postponed, awarded, replay, etc.)
  NOTE_RE = %r{
     \[
     (?<note>
       (?: ## starting with ___ PLUS requiring more text
         (?:
           nb:
           ## e.g. [NB: between top-8 of regular season]
           # [NB: América, Morelia and Tigres qualified on better record regular season]
           # [NB: Celaya qualified on away goals]
           # [NB: Alebrijes qualified on away goal]
           # [NB: Leones Negros qualified on away goals]
           #
           # todo/fix:
           # add "top-level" NB: version
           ## with full (end-of) line note - why? why not?
           |
           (?: originally[ ])? scheduled
           ## e.g. [originally scheduled to play in Mexico City]
           |
           rescheduled
           ## e.g. [Rescheduled due to earthquake occurred in Mexico on September 19]
           |
           remaining
           ## e.g. [remaining 79']
           ## [remaining 84']
           ## [remaining 59']
           ## [remaining 5']
           |
           played
           ## e.g. [played in Macaé-RJ]
           ## [played in Caxias do Sul-RS]
           ## [played in Sete Lagoas-MG]
           ## [played in Uberlândia-MG]
           ## [played in Brasília-DF]
           ## [played in Vöcklabruck]
           ## [played in Pasching]
           |
           declared
           ## e.g. [declared void]
           |
           inter-group
           ## e.g. [inter-group A-B]
           ## [inter-group C-D]
         )
         [ ]
         [^\]]+? ## slurp all to next ] - (use non-greedy)
       )
       |
       (?:
         ## starting with in - do NOT allow digits
         ## name starting with in possible - why? why not?
         in[ ]
         [^0-9\]]+?
         ## e.g. [In Estadio La Corregidora]
         ## [in Unidad Deportiva Centenario]
         ## [in Estadio Olímpico Universitario]
         ## [in Estadio Victoria]
         ## [in UD José Brindis]
         ## [in Colomos Alfredo "Pistache" Torres stadium]
       )
       |
       (?:
         (?:
           postponed
           ## e.g. [postponed due to problems with the screen of the stadium]
           ## [postponed by storm]
           ## [postponed due to tropical storm "Hanna"]
           ## [postponed from Sep 10-12 due to death Queen Elizabeth II]
           ## [postponed] -- include why? why not?
           |
           awarded
           ## e.g. [awarded match to Leones Negros by undue alignment; original result 1-2]
           ## [awarded 3-0 to Cafetaleros by undue alignment; originally ended 2-0]
           ## [awarded 3-0; originally 0-2, América used ineligible player (Federico Viñas)]
           |
           abandoned
           ## e.g. [abandoned at 1-1 in 65' due to cardiac arrest Luton player Tom Lockyer]
           ## [abandoned at 0-0 in 6' due to waterlogged pitch]
           ## [abandoned at 5-0 in 80' due to attack on assistant referee by Cerro; result stood]
           ## [abandoned at 1-0 in 31']
           ## [abandoned at 0-1' in 85 due to crowd trouble]
           |
           suspended
           ## e.g. [suspended at 0-0 in 12' due to storm]
           ## [suspended at 84' by storm; result stood]
           |
           annulled
           ## e.g. [annulled]
           |
           replay
           ## e.g. [replay]
         )
         ([ ] ## note - optional text
           [^\]]+?
         )? ## slurp all to next ] - (use non-greedy)
       )
     ) # note capture
     \]
  }ix



end # class Parser
end # module Rsssf
113
+