sportdb-parser 0.2.2 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0c9225b21f400b9f9cced2052c3062f41a091ed81d3d4239164c9652f53ebc6e
4
- data.tar.gz: f7250eaa21324962df27e7cdd397857afa570c610f00c80c31e5105e40964002
3
+ metadata.gz: 3721d30a9ec1145f59d6bc84bb9a6cf81330fdafa00314decc2c165f2c6b92c1
4
+ data.tar.gz: a50b917e0bc5db3ac21cb4ead5507233cbd91f4b6f3b52668ef74ba8c6db6140
5
5
  SHA512:
6
- metadata.gz: 471c938c233d8f81d7a0fd5e4470a27a52486906764816b6c35ea3d88e19650c81302fd5ff9ee30b85d3a8e9f81ada8eef20b49bd3de924c7238acb106ba6082
7
- data.tar.gz: 24d1cf3846404859ad7e751895325b256321d43e2881413fda6325c744ca0c31b52ef2032a9dfc8e56e67d7a06df54a6d2780a297982440b8e40b7055fe06c26
6
+ metadata.gz: 3e286842bcb5c163d841d414a15d4cbd91359324b88106603e22ebbec19b4cb8ffff3119d92a5c9e302e6aeabbc8a7ba74975efebee11ba275ce1d49f0286c1f
7
+ data.tar.gz: f237aedda025eb35bb08b31345941a3f1f073cbf8c3495c1afcf6e45de73f067e47a8a4c55981d5881079a5ca45a04af427908f379555208c98851c5d4751aa1
data/CHANGELOG.md CHANGED
@@ -1,4 +1,4 @@
1
- ### 0.2.2
1
+ ### 0.3.1
2
2
 
3
3
  ### 0.0.1 / 2024-07-12
4
4
 
data/Manifest.txt CHANGED
@@ -2,8 +2,11 @@ CHANGELOG.md
2
2
  Manifest.txt
3
3
  README.md
4
4
  Rakefile
5
+ bin/fbtok
5
6
  lib/sportdb/parser.rb
6
7
  lib/sportdb/parser/lang.rb
8
+ lib/sportdb/parser/linter.rb
9
+ lib/sportdb/parser/outline_reader.rb
7
10
  lib/sportdb/parser/parser.rb
8
11
  lib/sportdb/parser/token-date.rb
9
12
  lib/sportdb/parser/token-score.rb
data/bin/fbtok ADDED
@@ -0,0 +1,67 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ ## tip: to test run:
4
+ ## ruby -I ./lib bin/fbtok
5
+
6
+ require 'sportdb/parser'
7
+
8
+
9
+ require 'optparse' ## check - already auto-required in cocos? keep? why? why not?
10
+
11
+
12
+ args=ARGV
13
+
14
+
15
+ opts = {
16
+ debug: true,
17
+ metal: false,
18
+ }
19
+
20
+ parser = OptionParser.new do |parser|
21
+ parser.banner = "Usage: #{$PROGRAM_NAME} [options]"
22
+
23
+ parser.on( "--verbose", "--debug",
24
+ "turn on verbose / debug output (default: #{opts[:debug]})" ) do |debug|
25
+ opts[:debug] = true
26
+ end
27
+
28
+ parser.on( "--metal",
29
+ "turn off typed parse tree; show to the metal tokens"+
30
+ " (default: #{opts[:metal]})" ) do |metal|
31
+ opts[:metal] = true
32
+ end
33
+ end
34
+ parser.parse!( args )
35
+
36
+ puts "OPTS:"
37
+ p opts
38
+ puts "ARGV:"
39
+ p args
40
+
41
+
42
+ SportDb::Parser::Linter.debug = true if opts[:debug]
43
+
44
+ linter = SportDb::Parser::Linter.new
45
+ errors = []
46
+
47
+ paths = args
48
+ paths.each_with_index do |path,i|
49
+ puts "==> [#{i+1}/#{paths.size}] reading >#{path}<..."
50
+ linter.read( path, parse: !opts[:metal] )
51
+
52
+ errors += linter.errors if linter.errors?
53
+ end
54
+
55
+ if errors.size > 0
56
+ puts
57
+ pp errors
58
+ puts
59
+ puts "!! #{errors.size} parse error(s) in #{paths.size} datafiles(s)"
60
+ else
61
+ puts
62
+ puts "OK no parse errors found in #{paths.size} datafile(s)"
63
+ end
64
+
65
+
66
+ puts "bye"
67
+
@@ -64,6 +64,9 @@ ROUND_RE = %r{^(
64
64
  [ ] Round
65
65
  )
66
66
  |
67
+ ## Playoff Round 1
68
+ (?: Play-?off [ ] Round [ ] [1-9][0-9]* )
69
+ |
67
70
  ## starting with preliminary
68
71
  # e.g. Preliminary round
69
72
  (?: Preliminary [ ]
@@ -0,0 +1,156 @@
1
+
2
+ module SportDb
3
+ class Parser
4
+
5
+ ###
6
+ ## note - Linter for now nested inside Parser - keep? why? why not?
7
+ class Linter
8
+
9
+ def self.debug=(value) @@debug = value; end
10
+ def self.debug?() @@debug ||= false; end ## note: default is FALSE
11
+ def debug?() self.class.debug?; end
12
+
13
+
14
+
15
+ attr_reader :errors
16
+
17
+ def initialize
18
+ @errors = []
19
+ @parser = Parser.new ## use own parser instance (not shared) - why? why not?
20
+ end
21
+
22
+
23
+ def errors?() @errors.size > 0; end
24
+
25
+
26
+
27
+ ## note: colon (:) MUST be followed by one (or more) spaces
28
+ ## make sure mon feb 12 18:10 will not match
29
+ ## allow 1. FC Köln etc.
30
+ ## Mainz 05:
31
+ ## limit to 30 chars max
32
+ ## only allow chars incl. intl but (NOT ()[]/;)
33
+ ##
34
+ ## Group A:
35
+ ## Group B: - remove colon
36
+ ## or lookup first
37
+
38
+ ATTRIB_RE = %r{^
39
+ [ ]*? # slurp leading spaces
40
+ (?<key>[^:|\]\[()\/; -]
41
+ [^:|\]\[()\/;]{0,30}
42
+ )
43
+ [ ]*? # slurp trailing spaces
44
+ :[ ]+
45
+ (?<value>.+)
46
+ [ ]*? # slurp trailing spaces
47
+ $
48
+ }ix
49
+
50
+
51
+ #########
52
+ ## parse - false (default) - tokenize (only)
53
+ ## - true - tokenize & parse
54
+ def read( path, parse: false )
55
+ ## note: every (new) read call - resets errors list to empty
56
+ @errors = []
57
+
58
+ nodes = OutlineReader.read( path )
59
+
60
+ ## process nodes
61
+ h1 = nil
62
+ h2 = nil
63
+ orphans = 0 ## track paragraphs with no heading
64
+
65
+ attrib_found = false
66
+
67
+
68
+ nodes.each do |node|
69
+ type = node[0]
70
+
71
+ if type == :h1
72
+ h1 = node[1] ## get heading text
73
+ puts " = Heading 1 >#{node[1]}<"
74
+ elsif type == :h2
75
+ if h1.nil?
76
+ puts "!! WARN - no heading for subheading; skipping parse"
77
+ next
78
+ end
79
+ h2 = node[1] ## get heading text
80
+ puts " == Heading 2 >#{node[1]}<"
81
+ elsif type == :p
82
+
83
+ if h1.nil?
84
+ orphans += 1 ## only warn once
85
+ puts "!! WARN - no heading for #{orphans} text paragraph(s); skipping parse"
86
+ next
87
+ end
88
+
89
+ lines = node[1]
90
+
91
+ tree = []
92
+ lines.each_with_index do |line,i|
93
+
94
+ if debug?
95
+ puts
96
+ puts "line >#{line}<"
97
+ end
98
+
99
+
100
+ ## skip new (experimental attrib syntax)
101
+ if attrib_found == false &&
102
+ ATTRIB_RE.match?( line )
103
+ ## note: check attrib regex AFTER group def e.g.:
104
+ ## Group A:
105
+ ## Group B: etc.
106
+ ## todo/fix - change Group A: to Group A etc.
107
+ ## Group B: to Group B
108
+ attrib_found = true
109
+ ## logger.debug "skipping key/value line - >#{line}<"
110
+ next
111
+ end
112
+
113
+ if attrib_found
114
+ ## check if line ends with dot
115
+ ## if not slurp up lines to the next dot!!!
116
+ ## logger.debug "skipping key/value line - >#{line}<"
117
+ attrib_found = false if line.end_with?( '.' )
118
+ # logger.debug "skipping key/value line (cont.) - >#{line}<"
119
+ next
120
+ end
121
+
122
+ t, error_messages = if parse
123
+ @parser.parse_with_errors( line )
124
+ else
125
+ @parser.tokenize_with_errors( line )
126
+ end
127
+
128
+
129
+ if error_messages.size > 0
130
+ ## add to "global" error list
131
+ ## make a triplet tuple (file / msg / line text)
132
+ error_messages.each do |msg|
133
+ @errors << [ path,
134
+ msg,
135
+ line
136
+ ]
137
+ end
138
+ end
139
+
140
+ pp t if debug?
141
+
142
+ tree << t
143
+ end
144
+
145
+ ## pp tree
146
+ else
147
+ pp node
148
+ raise ArgumentError, "unsupported (node) type >#{type}<"
149
+ end
150
+ end # each node
151
+ end # read
152
+ end # class Linter
153
+
154
+
155
+ end # class Parser
156
+ end # module SportDb
@@ -0,0 +1,97 @@
1
+
2
+
3
+ module SportDb
4
+
5
+ class OutlineReader
6
+
7
+ def self.debug=(value) @@debug = value; end
8
+ def self.debug?() @@debug ||= false; end
9
+ def debug?() self.class.debug?; end
10
+
11
+
12
+
13
+ def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
14
+ txt = File.open( path, 'r:utf-8' ) {|f| f.read }
15
+ parse( txt )
16
+ end
17
+
18
+ def self.parse( txt )
19
+ new( txt ).parse
20
+ end
21
+
22
+ def initialize( txt )
23
+ @txt = txt
24
+ end
25
+
26
+ ## note: skip "decorative" only heading e.g. ========
27
+ ## todo/check: find a better name e.g. HEADING_EMPTY_RE or HEADING_LINE_RE or ???
28
+ HEADING_BLANK_RE = %r{\A
29
+ ={1,}
30
+ \z}x
31
+
32
+ ## note: like in wikimedia markup (and markdown) allow optional trailing ==== too
33
+ HEADING_RE = %r{\A
34
+ (?<marker>={1,}) ## 1. leading ======
35
+ [ ]*
36
+ (?<text>[^=]+) ## 2. text (note: for now no "inline" = allowed)
37
+ [ ]*
38
+ =* ## 3. (optional) trailing ====
39
+ \z}x
40
+
41
+ def parse
42
+ outline=[] ## outline structure
43
+ start_para = true ## start new para(graph) on new text line?
44
+
45
+ @txt.each_line do |line|
46
+ line = line.strip ## todo/fix: keep leading and trailing spaces - why? why not?
47
+
48
+ if line.empty? ## todo/fix: keep blank line nodes?? and just remove comments and process headings?! - why? why not?
49
+ start_para = true
50
+ next
51
+ end
52
+
53
+ break if line == '__END__'
54
+
55
+ next if line.start_with?( '#' ) ## skip comments too
56
+ ## strip inline (until end-of-line) comments too
57
+ ## e.g Eupen | KAS Eupen ## [de]
58
+ ## => Eupen | KAS Eupen
59
+ ## e.g bq Bonaire, BOE # CONCACAF
60
+ ## => bq Bonaire, BOE
61
+ line = line.sub( /#.*/, '' ).strip
62
+ pp line if debug?
63
+
64
+ ## todo/check: also use heading blank as paragraph "breaker" or treat it like a comment ?? - why? why not?
65
+ next if HEADING_BLANK_RE.match( line ) # skip "decorative" only heading e.g. ========
66
+
67
+ ## note: like in wikimedia markup (and markdown) allow optional trailing ==== too
68
+ if m=HEADING_RE.match( line )
69
+ start_para = true
70
+
71
+ heading_marker = m[:marker]
72
+ heading_level = heading_marker.length ## count number of = for heading level
73
+ heading = m[:text].strip
74
+
75
+ puts "heading #{heading_level} >#{heading}<" if debug?
76
+ outline << [:"h#{heading_level}", heading]
77
+ else ## assume it's a (plain/regular) text line
78
+ if start_para
79
+ outline << [:p, [line]]
80
+ start_para = false
81
+ else
82
+ node = outline[-1] ## get last entry
83
+ if node[0] == :p ## assert it's a p(aragraph) node!!!
84
+ node[1] << line ## add line to p(aragraph)
85
+ else
86
+ puts "!! ERROR - invalid outline state / format - expected p(aragraph) node; got:"
87
+ pp node
88
+ exit 1
89
+ end
90
+ end
91
+ end
92
+ end
93
+ outline
94
+ end # method parse
95
+ end # class OutlineReader
96
+
97
+ end # module SportDb
@@ -1,12 +1,12 @@
1
- module SportDb
1
+ module SportDb
2
2
  class Parser
3
-
4
-
3
+
4
+
5
5
  ## note - do NOT allow single alpha text for now
6
- ## add later?? A - B C - D - why?
6
+ ## add later?? A - B C - D - why?
7
7
  ## opt 1) one alpha
8
- ## (?<text_i> [a-z]) # only allow single letter text (not numbers!!)
9
-
8
+ ## (?<text_i> [a-z]) # only allow single letter text (not numbers!!)
9
+
10
10
  ## opt 2) more than one alphanum
11
11
 
12
12
 
@@ -26,19 +26,19 @@ class Parser
26
26
 
27
27
 
28
28
  TEXT_RE = %r{
29
- ## must start with alpha (allow unicode letters!!)
30
- (?<text>
31
- ## positive lookbehind
29
+ ## must start with alpha (allow unicode letters!!)
30
+ (?<text>
31
+ ## positive lookbehind
32
32
  ## (MUST be fixed number of chars - no quantifier e.g. +? etc.)
33
33
  (?<=[ ,;@|\[\]]
34
34
  |^
35
35
  )
36
- (?:
36
+ (?:
37
37
  # opt 1 - start with alpha
38
38
  \p{L}+ ## all unicode letters (e.g. [a-z])
39
39
  |
40
40
 
41
- # opt 2 - start with num!! - allow special case (e.g. 1. FC)
41
+ # opt 2 - start with num!! - allow special case (e.g. 1. FC)
42
42
  \d+ # check for num lookahead (MUST be space or dot)
43
43
  ## MUST be followed by (optional dot) and
44
44
  ## required space !!!
@@ -46,69 +46,79 @@ TEXT_RE = %r{
46
46
  \.? ## optional dot
47
47
  [ ]? ## make space optional too - why? why not?
48
48
  ## yes - eg. 1st, 2nd, 5th etc.
49
- \p{L}+
49
+ \p{L}+
50
50
  )
51
-
51
+
52
52
  (?:(?: (?:[ ]
53
53
  (?!vs?\.?[ ]) ## note - exclude (v[ ]/vs[ ]/v.[ ]/vs.[ ])
54
- )
54
+ )
55
55
  | # only single spaces allowed inline!!!
56
- [-]
56
+ [-]
57
57
  )?
58
58
  (?:
59
59
  \p{L} |
60
- [&/']
60
+ [&/']
61
61
  |
62
62
  (?:
63
- \d+
64
- (?![0-9.:h'/+-])
63
+ \d+
64
+ (?![0-9.:h'/+-])
65
65
  ## negative lookahead for numbers
66
66
  ## note - include digits itself!!!
67
- )|
68
- \.
69
- )
67
+ )|
68
+ \.
69
+ )
70
70
  )* ## must NOT end with space or dash(-)
71
71
  ## todo/fix - possible in regex here
72
72
  ## only end in alphanum a-z0-9 (not dot or & ???)
73
73
 
74
-
74
+
75
75
  ## allow optional at the end
76
76
  ## tag or year
77
- ## make it and in the future - why? why not?
78
- ##
77
+ ## make it and in the future - why? why not?
78
+ ##
79
+ ## change - fix
80
+ ## do NOT use (A) for amateur
81
+ ## use A or A. with NO ()!!!
79
82
  ## (A) - allow with predefined alpha only for now
80
83
  ## e.g. (A) - amateur a team or b?
84
+ ### same for U21 or U9 etc
85
+ ## use with NO ()!!! - why? why not?
81
86
  ## or U21 U9 etc. - why? why not?
82
87
  ## or etc.
83
88
  ## (1879-1893) or allow years e.g. (1879-1893)
84
- ###
85
- (?:
86
- [ ]
87
- \( (?:
88
- A|B|
89
- U\d{1,2}
90
- )
91
- \)
92
- )?
89
+ ###
90
+ ## add allow country code three to five letters for now
91
+ ## change to generic 1 to 5 - why? why not?
92
+ ## e.g. (A), (I),
93
+ ## (AUT)
94
+ ## (TRNC) five? for UEFA code for northern cyprus
95
+ ## change to 1 to 4 - why? why not?
96
+ ## check - fix possible for upper case only here
97
+ ## inline for this group only?
93
98
  (?:
94
- [ ]
99
+ [ ]
95
100
  \(
96
101
  \d{4}-\d{4}
97
102
  \)
98
- )?
99
-
103
+ )?
104
+ (?:
105
+ [ ]+ ## allow more than one space - why? why not?
106
+ \( (?:
107
+ [A-Z]{1,5}
108
+ )
109
+ \)
110
+ )?
100
111
  ## add lookahead/lookbehind
101
- ## must be space!!!
112
+ ## must be space!!!
102
113
  ## (or comma or start/end of string)
103
114
  ## kind of \b !!!
104
115
  ## positive lookahead
105
116
  (?=[ ,;@|\[\]]
106
117
  |$
107
118
  )
108
- )
119
+ )
109
120
  }ix
110
121
 
111
122
 
112
123
  end # class Parser
113
- end # module SportDb
114
-
124
+ end # module SportDb
@@ -3,8 +3,8 @@ module SportDb
3
3
  module Module
4
4
  module Parser
5
5
  MAJOR = 0 ## todo: namespace inside version or something - why? why not??
6
- MINOR = 2
7
- PATCH = 2
6
+ MINOR = 3
7
+ PATCH = 1
8
8
  VERSION = [MAJOR,MINOR,PATCH].join('.')
9
9
 
10
10
  def self.version
@@ -24,6 +24,11 @@ require_relative 'parser/lang'
24
24
  require_relative 'parser/parser'
25
25
 
26
26
 
27
+ ####
28
+ ## todo/check - move outline reader upstream to cocos - why? why not?
29
+ ## use read_outline(), parse_outline() - why? why not?
30
+ require_relative 'parser/outline_reader'
31
+ require_relative 'parser/linter'
27
32
 
28
33
  ###
29
34
  # make parser api (easily) available - why? why not?
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sportdb-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-08-27 00:00:00.000000000 Z
11
+ date: 2024-09-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: cocos
@@ -74,7 +74,8 @@ dependencies:
74
74
  version: '4.1'
75
75
  description: sportdb-parser - football.txt match parser (& tokenizer)
76
76
  email: gerald.bauer@gmail.com
77
- executables: []
77
+ executables:
78
+ - fbtok
78
79
  extensions: []
79
80
  extra_rdoc_files:
80
81
  - CHANGELOG.md
@@ -85,8 +86,11 @@ files:
85
86
  - Manifest.txt
86
87
  - README.md
87
88
  - Rakefile
89
+ - bin/fbtok
88
90
  - lib/sportdb/parser.rb
89
91
  - lib/sportdb/parser/lang.rb
92
+ - lib/sportdb/parser/linter.rb
93
+ - lib/sportdb/parser/outline_reader.rb
90
94
  - lib/sportdb/parser/parser.rb
91
95
  - lib/sportdb/parser/token-date.rb
92
96
  - lib/sportdb/parser/token-score.rb