sportdb-parser 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0c9225b21f400b9f9cced2052c3062f41a091ed81d3d4239164c9652f53ebc6e
4
- data.tar.gz: f7250eaa21324962df27e7cdd397857afa570c610f00c80c31e5105e40964002
3
+ metadata.gz: 9ebb468318f2b87c33ca66afb6c46611ce5f420258e0c41b40a2cbfabcff7a49
4
+ data.tar.gz: 0cf1d511f3e936d73531442d1ca6bef94d90a50ae65346b5b57347d4d294dc77
5
5
  SHA512:
6
- metadata.gz: 471c938c233d8f81d7a0fd5e4470a27a52486906764816b6c35ea3d88e19650c81302fd5ff9ee30b85d3a8e9f81ada8eef20b49bd3de924c7238acb106ba6082
7
- data.tar.gz: 24d1cf3846404859ad7e751895325b256321d43e2881413fda6325c744ca0c31b52ef2032a9dfc8e56e67d7a06df54a6d2780a297982440b8e40b7055fe06c26
6
+ metadata.gz: e992ab97d7ae18c514de14078d30eb36adc40f5044242ce9ace089fb88b104c61b29ff86a2aa8101bb7257c3ff2ce32c6150439ff855e195bee1b26032bb0d9d
7
+ data.tar.gz: 25e66e45e7daf2783bc6507a3cb2c660d9153eab9530210ef51ef6e0d5d3fc531e5891897be3b0492b0ad7ea5fe3d406a0a3dd0559549b85518360d442ed4d8b
data/CHANGELOG.md CHANGED
@@ -1,4 +1,4 @@
1
- ### 0.2.2
1
+ ### 0.3.0
2
2
 
3
3
  ### 0.0.1 / 2024-07-12
4
4
 
data/Manifest.txt CHANGED
@@ -2,8 +2,11 @@ CHANGELOG.md
2
2
  Manifest.txt
3
3
  README.md
4
4
  Rakefile
5
+ bin/fbtok
5
6
  lib/sportdb/parser.rb
6
7
  lib/sportdb/parser/lang.rb
8
+ lib/sportdb/parser/linter.rb
9
+ lib/sportdb/parser/outline_reader.rb
7
10
  lib/sportdb/parser/parser.rb
8
11
  lib/sportdb/parser/token-date.rb
9
12
  lib/sportdb/parser/token-score.rb
data/bin/fbtok ADDED
@@ -0,0 +1,67 @@
1
#!/usr/bin/env ruby

## fbtok - command line front end for SportDb::Parser::Linter;
##   every (non-option) argument is treated as a path to a datafile,
##   each file gets read by the linter and all collected errors get
##   printed in a summary at the end.
##
## tip: to test run:
##   ruby -I ./lib bin/fbtok

require 'sportdb/parser'


require 'optparse'    ## check - already auto-required in cocos? keep? why? why not?


args = ARGV


opts = {
  debug: true,
  metal: false,
}

## note: block param is NOT named parser to avoid shadowing the outer local
option_parser = OptionParser.new do |cli|
  cli.banner = "Usage: #{$PROGRAM_NAME} [options]"

  ## NOTE(review): debug defaults to true (see opts above), thus,
  ##   passing --verbose/--debug does not change anything for now - confirm intended
  cli.on( "--verbose", "--debug",
          "turn on verbose / debug output (default: #{opts[:debug]})" ) do |debug|
    opts[:debug] = debug
  end

  cli.on( "--metal",
          "turn off typed parse tree; show to the metal tokens"+
          " (default: #{opts[:metal]})" ) do |metal|
    opts[:metal] = metal
  end
end
## note: parse! removes all recognized options from args (in-place);
##         what is left gets used as the datafile paths
option_parser.parse!( args )

puts "OPTS:"
p opts
puts "ARGV:"
p args


SportDb::Parser::Linter.debug = true    if opts[:debug]

linter = SportDb::Parser::Linter.new
errors = []

paths = args
paths.each_with_index do |path,i|
  puts "==> [#{i+1}/#{paths.size}] reading >#{path}<..."
  ## --metal => tokenize only; otherwise tokenize & parse
  linter.read( path, parse: !opts[:metal] )

  ## note: linter resets its error list on every read;
  ##         copy errors over into the "global" list
  errors.concat( linter.errors )
end

## NOTE(review): exit status stays 0 even if parse errors got found - confirm intended
if errors.size > 0
  puts
  pp errors
  puts
  puts "!! #{errors.size} parse error(s) in #{paths.size} datafile(s)"
else
  puts
  puts "OK no parse errors found in #{paths.size} datafile(s)"
end


puts "bye"
67
+
@@ -0,0 +1,149 @@
1
+
2
module SportDb
class Parser

###
## note - Linter for now nested inside Parser - keep? why? why not?
##
##  Linter - reads an outline file (headings & text paragraphs),
##    runs every text line below a level-1 heading through the
##    tokenizer (or the tokenizer & parser) and
##    collects all reported error messages in errors.
class Linter

  def self.debug=(value) @@debug = value; end
  def self.debug?() @@debug ||= false; end  ## note: default is FALSE
  def debug?()  self.class.debug?; end


  ## errors of the latest read call -
  ##   a list of triplet tuples (file path / msg / line text)
  attr_reader :errors

  def initialize
    @errors = []
    @parser = Parser.new   ## use own parser instance (not shared) - why? why not?
  end


  def errors?() @errors.size > 0; end


  ## note: colon (:) MUST be followed by one (or more) spaces
  ##      make sure mon feb 12 18:10 will not match
  ##        allow 1. FC Köln etc.
  ##               Mainz 05:
  ##           limit to 30 chars max
  ##          only allow chars incl. intl but (NOT ()[]/;)
  ##
  ##   Group A:
  ##   Group B:   - remove colon
  ##    or lookup first
  ##
  ## NOTE(review): the key is one start char plus up to 30 more chars,
  ##    that is, 31 chars max (not 30) - confirm which limit is intended
  ATTRIB_RE = %r{^
                 [ ]*?     # slurp leading spaces
              (?<key>[^:|\]\[()\/; -]
                     [^:|\]\[()\/;]{0,30}
               )
                 [ ]*?     # slurp trailing spaces
                 :[ ]+
              (?<value>.+)
                  [ ]*?   # slurp trailing spaces
                $
              }ix


  #########
  ## read outline in path and lint all text lines
  ##
  ##  parse - false (default) - tokenize (only)
  ##        - true            - tokenize & parse
  ##
  ##  raises ArgumentError on node types other than :h1 and :p
  def read( path, parse: false )
    ## note: every (new) read call - resets errors list to empty
    @errors = []

    nodes = OutlineReader.read( path )

    ## process nodes
    h1        = nil
    orphans   = 0      ## running count of paragraphs with no heading
    in_attrib = false  ## inside of a (multi-line) attribute?
                       ##  note: state carries over to the next paragraph

    nodes.each do |node|
      type = node[0]

      case type
      when :h1
        h1 = node[1]  ## get heading text
        puts
        puts "  = Heading 1 >#{h1}<"
      when :p
        if h1.nil?
          ## note: warning gets printed for every orphaned paragraph
          orphans += 1
          puts "!! WARN - no heading for #{orphans} text paragraph(s); skipping parse"
          next
        end

        in_attrib = lint_lines( path, node[1], parse:     parse,
                                               in_attrib: in_attrib )
      else
        pp node
        raise ArgumentError, "unsupported (node) type >#{type}<"
      end
    end  # each node
  end  # read


  private

  ## lint all lines of a paragraph; errors get added to @errors
  ##   returns the updated in_attrib state (true if the paragraph
  ##   ended inside of a still "open" attribute)
  def lint_lines( path, lines, parse:, in_attrib: )
    lines.each do |line|
      if debug?
        puts
        puts "line >#{line}<"
      end

      ## skip new (experimental attrib syntax)
      if in_attrib
        ## check if line ends with dot
        ##  if not slurp up lines to the next dot!!!
        in_attrib = false   if line.end_with?( '.' )
        next
      end

      ## note: check attrib regex AFTER group def e.g.:
      ##         Group A:
      ##         Group B:  etc.
      ##       todo/fix - change Group A: to Group A etc.
      ##                         Group B: to Group B
      if ATTRIB_RE.match?( line )
        in_attrib = true
        next
      end

      t, error_messages = if parse
                            @parser.parse_with_errors( line )
                          else
                            @parser.tokenize_with_errors( line )
                          end

      ## add to "global" error list
      ##   make a triplet tuple (file / msg / line text)
      error_messages.each do |msg|
        @errors << [ path, msg, line ]
      end

      pp t   if debug?
    end

    in_attrib
  end
end  # class Linter


end   # class Parser
end   # module SportDb
@@ -0,0 +1,97 @@
1
+
2
+
3
module SportDb

## OutlineReader - splits up text into a flat list of nodes:
##    heading nodes   - [:h1, "text"], [:h2, "text"], etc.
##    paragraph nodes - [:p, ["line 1", "line 2", ...]]
class OutlineReader

  def self.debug=(value) @@debug = value; end
  def self.debug?() @@debug ||= false; end
  def debug?()  self.class.debug?; end



  ## read text from file in path (as utf-8) and return the outline
  ##   note: a leading byte order mark (bom) gets stripped, otherwise
  ##          a heading in the very first line would not get matched
  def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
    txt = File.open( path, 'r:bom|utf-8' ) {|f| f.read }
    parse( txt )
  end

  ## parse text and return the outline (list of nodes)
  def self.parse( txt )
    new( txt ).parse
  end

  def initialize( txt )
    @txt = txt
  end

  ## note: skip "decorative" only heading e.g. ========
  ##  todo/check: find a better name e.g. HEADING_EMPTY_RE or HEADING_LINE_RE or ???
  HEADING_BLANK_RE = %r{\A
                        ={1,}
                        \z}x

  ## note: like in wikimedia markup (and markdown) all optional trailing ==== too
  HEADING_RE = %r{\A
                  (?<marker>={1,})       ## 1. leading ======
                    [ ]*
                  (?<text>[^=]+)         ## 2. text  (note: for now no "inline" = allowed)
                    [ ]*
                    =*                   ## 3. (optional) trailing ====
                  \z}x

  ## returns the outline structure (list of heading & paragraph nodes);
  ##   blank lines and headings end the current paragraph,
  ##   comments (#) get removed and
  ##   everything after an __END__ line gets ignored
  def parse
    outline = []   ## outline structure
    para    = nil  ## lines of the current ("open") paragraph;
                   ##   nil => start new para(graph) on new text line

    @txt.each_line do |line|
        line = line.strip      ## todo/fix: keep leading and trailing spaces - why? why not?

        if line.empty?    ## todo/fix: keep blank line nodes?? and just remove comments and process headings?! - why? why not?
          para = nil
          next
        end

        break if line == '__END__'

        next if line.start_with?( '#' )   ## skip comments too
        ## strip inline (until end-of-line) comments too
        ##  e.g Eupen | KAS Eupen ## [de]
        ##   => Eupen | KAS Eupen
        ##  e.g bq   Bonaire,  BOE        # CONCACAF
        ##   => bq   Bonaire,  BOE
        line = line.sub( /#.*/, '' ).strip
        pp line    if debug?

        ## todo/check: also use heading blank as paragraph "breaker" or treat it like a comment ?? - why? why not?
        next if HEADING_BLANK_RE.match?( line )  # skip "decorative" only heading e.g. ========

        ## note: like in wikimedia markup (and markdown) all optional trailing ==== too
        if (m = HEADING_RE.match( line ))
          para = nil

          heading_level = m[:marker].length   ## count number of = for heading level
          heading       = m[:text].strip

          puts "heading #{heading_level} >#{heading}<"   if debug?
          outline << [:"h#{heading_level}", heading]
        elsif para     ## (plain/regular) text line - add to "open" p(aragraph)
          para << line
        else           ## (plain/regular) text line - start new p(aragraph)
          ## note: para is the very same array as stored in the node,
          ##        thus, no lookup of the last outline entry (and
          ##        no invalid state / exit) required
          para = [line]
          outline << [:p, para]
        end
    end
    outline
  end # method parse
end # class OutlineReader

end # module SportDb
@@ -1,12 +1,12 @@
1
- module SportDb
1
+ module SportDb
2
2
  class Parser
3
-
4
-
3
+
4
+
5
5
  ## note - do NOT allow single alpha text for now
6
- ## add later?? A - B C - D - why?
6
+ ## add later?? A - B C - D - why?
7
7
  ## opt 1) one alpha
8
- ## (?<text_i> [a-z]) # only allow single letter text (not numbers!!)
9
-
8
+ ## (?<text_i> [a-z]) # only allow single letter text (not numbers!!)
9
+
10
10
  ## opt 2) more than one alphanum
11
11
 
12
12
 
@@ -26,19 +26,19 @@ class Parser
26
26
 
27
27
 
28
28
  TEXT_RE = %r{
29
- ## must start with alpha (allow unicode letters!!)
30
- (?<text>
31
- ## positive lookbehind
29
+ ## must start with alpha (allow unicode letters!!)
30
+ (?<text>
31
+ ## positive lookbehind
32
32
  ## (MUST be fixed number of chars - no quantifier e.g. +? etc.)
33
33
  (?<=[ ,;@|\[\]]
34
34
  |^
35
35
  )
36
- (?:
36
+ (?:
37
37
  # opt 1 - start with alpha
38
38
  \p{L}+ ## all unicode letters (e.g. [a-z])
39
39
  |
40
40
 
41
- # opt 2 - start with num!! - allow special case (e.g. 1. FC)
41
+ # opt 2 - start with num!! - allow special case (e.g. 1. FC)
42
42
  \d+ # check for num lookahead (MUST be space or dot)
43
43
  ## MUST be followed by (optional dot) and
44
44
  ## required space !!!
@@ -46,69 +46,79 @@ TEXT_RE = %r{
46
46
  \.? ## optional dot
47
47
  [ ]? ## make space optional too - why? why not?
48
48
  ## yes - eg. 1st, 2nd, 5th etc.
49
- \p{L}+
49
+ \p{L}+
50
50
  )
51
-
51
+
52
52
  (?:(?: (?:[ ]
53
53
  (?!vs?\.?[ ]) ## note - exclude (v[ ]/vs[ ]/v.[ ]/vs.[ ])
54
- )
54
+ )
55
55
  | # only single spaces allowed inline!!!
56
- [-]
56
+ [-]
57
57
  )?
58
58
  (?:
59
59
  \p{L} |
60
- [&/']
60
+ [&/']
61
61
  |
62
62
  (?:
63
- \d+
64
- (?![0-9.:h'/+-])
63
+ \d+
64
+ (?![0-9.:h'/+-])
65
65
  ## negative lookahead for numbers
66
66
  ## note - include digits itself!!!
67
- )|
68
- \.
69
- )
67
+ )|
68
+ \.
69
+ )
70
70
  )* ## must NOT end with space or dash(-)
71
71
  ## todo/fix - possible in regex here
72
72
  ## only end in alphanum a-z0-9 (not dot or & ???)
73
73
 
74
-
74
+
75
75
  ## allow optional at the end
76
76
  ## tag or year
77
- ## make it and in the future - why? why not?
78
- ##
77
+ ## make it and in the future - why? why not?
78
+ ##
79
+ ## change - fix
80
+ ## do NOT use (A) for amateur
81
+ ## use A or A. with NO ()!!!
79
82
  ## (A) - allow with predined alpha only for now
80
83
  ## e.g. (A) - amateur a team or b?
84
+ ### same for U21 or U9 etc
85
+ ## use with NO ()!!! - why? why not?
81
86
  ## or U21 U9 etc. - why? why not?
82
87
  ## or etc.
83
88
  ## (1879-1893) or allow years e.g. (1879-1893)
84
- ###
85
- (?:
86
- [ ]
87
- \( (?:
88
- A|B|
89
- U\d{1,2}
90
- )
91
- \)
92
- )?
89
+ ###
90
+ ## add allow country code three to five letters for now
91
+ ## change to generic 1 to 5 - why? why not?
92
+ ## e.g. (A), (I),
93
+ ## (AUT)
94
+ ## (TRNC) five? for UEFA code for northern cyprus
95
+ ## change to 1 to 4 - why? why not?
96
+ ## check - fix possible for upper case only here
97
+ ## inline for this group only?
93
98
  (?:
94
- [ ]
99
+ [ ]
95
100
  \(
96
101
  \d{4}-\d{4}
97
102
  \)
98
- )?
99
-
103
+ )?
104
+ (?:
105
+ [ ]+ ## allow more than once space - why? why not?
106
+ \( (?:
107
+ [A-Z]{1,5}
108
+ )
109
+ \)
110
+ )?
100
111
  ## add lookahead/lookbehind
101
- ## must be space!!!
112
+ ## must be space!!!
102
113
  ## (or comma or start/end of string)
103
114
  ## kind of \b !!!
104
115
  ## positive lookahead
105
116
  (?=[ ,;@|\[\]]
106
117
  |$
107
118
  )
108
- )
119
+ )
109
120
  }ix
110
121
 
111
122
 
112
123
  end # class Parser
113
- end # module SportDb
114
-
124
+ end # module SportDb
@@ -3,8 +3,8 @@ module SportDb
3
3
  module Module
4
4
  module Parser
5
5
  MAJOR = 0 ## todo: namespace inside version or something - why? why not??
6
- MINOR = 2
7
- PATCH = 2
6
+ MINOR = 3
7
+ PATCH = 0
8
8
  VERSION = [MAJOR,MINOR,PATCH].join('.')
9
9
 
10
10
  def self.version
@@ -24,6 +24,11 @@ require_relative 'parser/lang'
24
24
  require_relative 'parser/parser'
25
25
 
26
26
 
27
+ ####
28
+ ## todo/check - move outline reader upstream to cocos - why? why not?
29
+ ## use read_outline(), parse_outline() - why? why not?
30
+ require_relative 'parser/outline_reader'
31
+ require_relative 'parser/linter'
27
32
 
28
33
  ###
29
34
  # make parser api (easily) available - why? why not?
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sportdb-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-08-27 00:00:00.000000000 Z
11
+ date: 2024-09-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: cocos
@@ -74,7 +74,8 @@ dependencies:
74
74
  version: '4.1'
75
75
  description: sportdb-parser - football.txt match parser (& tokenizer)
76
76
  email: gerald.bauer@gmail.com
77
- executables: []
77
+ executables:
78
+ - fbtok
78
79
  extensions: []
79
80
  extra_rdoc_files:
80
81
  - CHANGELOG.md
@@ -85,8 +86,11 @@ files:
85
86
  - Manifest.txt
86
87
  - README.md
87
88
  - Rakefile
89
+ - bin/fbtok
88
90
  - lib/sportdb/parser.rb
89
91
  - lib/sportdb/parser/lang.rb
92
+ - lib/sportdb/parser/linter.rb
93
+ - lib/sportdb/parser/outline_reader.rb
90
94
  - lib/sportdb/parser/parser.rb
91
95
  - lib/sportdb/parser/token-date.rb
92
96
  - lib/sportdb/parser/token-score.rb