sportdb-parser 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Manifest.txt +0 -4
- data/Rakefile +1 -1
- data/lib/sportdb/parser/lang.rb +24 -7
- data/lib/sportdb/parser/token-date.rb +128 -21
- data/lib/sportdb/parser/version.rb +1 -1
- data/lib/sportdb/parser.rb +0 -5
- metadata +4 -9
- data/bin/fbt +0 -94
- data/lib/sportdb/parser/linter.rb +0 -149
- data/lib/sportdb/parser/opts.rb +0 -70
- data/lib/sportdb/parser/outline_reader.rb +0 -97
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0c9225b21f400b9f9cced2052c3062f41a091ed81d3d4239164c9652f53ebc6e
|
4
|
+
data.tar.gz: f7250eaa21324962df27e7cdd397857afa570c610f00c80c31e5105e40964002
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 471c938c233d8f81d7a0fd5e4470a27a52486906764816b6c35ea3d88e19650c81302fd5ff9ee30b85d3a8e9f81ada8eef20b49bd3de924c7238acb106ba6082
|
7
|
+
data.tar.gz: 24d1cf3846404859ad7e751895325b256321d43e2881413fda6325c744ca0c31b52ef2032a9dfc8e56e67d7a06df54a6d2780a297982440b8e40b7055fe06c26
|
data/CHANGELOG.md
CHANGED
data/Manifest.txt
CHANGED
@@ -2,12 +2,8 @@ CHANGELOG.md
|
|
2
2
|
Manifest.txt
|
3
3
|
README.md
|
4
4
|
Rakefile
|
5
|
-
bin/fbt
|
6
5
|
lib/sportdb/parser.rb
|
7
6
|
lib/sportdb/parser/lang.rb
|
8
|
-
lib/sportdb/parser/linter.rb
|
9
|
-
lib/sportdb/parser/opts.rb
|
10
|
-
lib/sportdb/parser/outline_reader.rb
|
11
7
|
lib/sportdb/parser/parser.rb
|
12
8
|
lib/sportdb/parser/token-date.rb
|
13
9
|
lib/sportdb/parser/token-score.rb
|
data/Rakefile
CHANGED
data/lib/sportdb/parser/lang.rb
CHANGED
@@ -27,6 +27,12 @@ end
|
|
27
27
|
|
28
28
|
ROUND_RE = %r{^(
|
29
29
|
|
30
|
+
## add special case for group play-off rounds!
|
31
|
+
## group 2 play-off (e.g. worldcup 1954, 1958)
|
32
|
+
(?: Group [ ] [a-z0-9]+ [ ]
|
33
|
+
Play-?offs?
|
34
|
+
)
|
35
|
+
|
|
30
36
|
# round - note - requiers number e.g. round 1,2, etc.
|
31
37
|
# note - use 1-9 regex (cannot start with 0) - why? why not?
|
32
38
|
# make week 01 or round 01 or matchday 01 possible?
|
@@ -46,17 +52,23 @@ ROUND_RE = %r{^(
|
|
46
52
|
|
|
47
53
|
## 1. Round / 2. Round / 3. Round / etc.
|
48
54
|
## Play-off Round
|
55
|
+
## First Round
|
56
|
+
## Final Round (e.g. Worldcup 1950)
|
49
57
|
(?:
|
50
|
-
(?: [1-9][0-9]* \.
|
51
|
-
|
|
52
|
-
|
58
|
+
(?: [1-9][0-9]* \. |
|
59
|
+
Play-?off |
|
60
|
+
1st | First |
|
61
|
+
2nd | Second |
|
62
|
+
Final
|
53
63
|
)
|
54
64
|
[ ] Round
|
55
65
|
)
|
56
66
|
|
|
57
67
|
## starting with preliminary
|
68
|
+
# e.g. Preliminary round
|
58
69
|
(?: Preliminary [ ]
|
59
|
-
(?:
|
70
|
+
(?: Round |
|
71
|
+
Semi-?finals |
|
60
72
|
Final
|
61
73
|
)
|
62
74
|
)
|
@@ -110,10 +122,15 @@ ROUND_RE = %r{^(
|
|
110
122
|
Finals?
|
111
123
|
|
|
112
124
|
## add replays
|
113
|
-
## Final Replay
|
125
|
+
## e.g. Final Replay
|
126
|
+
## Quarter-finals replays
|
127
|
+
## First round replays
|
114
128
|
(?:
|
115
|
-
|
116
|
-
|
129
|
+
(?: First [ ] Round |
|
130
|
+
Quarter-?finals? |
|
131
|
+
Finals?
|
132
|
+
)
|
133
|
+
[ ] Replays?
|
117
134
|
)
|
118
135
|
)$}ix
|
119
136
|
|
data/lib/sportdb/parser/token-date.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
module SportDb
|
1
|
+
module SportDb
|
2
2
|
class Parser
|
3
|
-
|
3
|
+
|
4
4
|
|
5
5
|
|
6
6
|
def self.parse_names( txt )
|
@@ -47,8 +47,8 @@ def self.build_map( lines, downcase: false )
|
|
47
47
|
## "may" => 5,
|
48
48
|
## "june" => 6, "jun" => 6, ...
|
49
49
|
lines.each_with_index.reduce( {} ) do |h,(line,i)|
|
50
|
-
line.each do |name|
|
51
|
-
h[ downcase ? name.downcase : name ] = i+1
|
50
|
+
line.each do |name|
|
51
|
+
h[ downcase ? name.downcase : name ] = i+1
|
52
52
|
end ## note: start mapping with 1 (and NOT zero-based, that is, 0)
|
53
53
|
h
|
54
54
|
end
|
@@ -109,28 +109,85 @@ DAY_MAP = build_map( DAY_LINES, downcase: true )
|
|
109
109
|
## todo - add more date variants !!!!
|
110
110
|
|
111
111
|
# e.g. Fri Aug/9 or Fri Aug 9
|
112
|
-
|
112
|
+
DATE_I_RE = %r{
|
113
113
|
(?<date>
|
114
114
|
\b
|
115
115
|
## optional day name
|
116
116
|
((?<day_name>#{DAY_NAMES})
|
117
117
|
[ ]
|
118
|
-
)?
|
118
|
+
)?
|
119
119
|
(?<month_name>#{MONTH_NAMES})
|
120
120
|
(?: \/|[ ] )
|
121
121
|
(?<day>\d{1,2})
|
122
122
|
## optional year
|
123
123
|
( [ ]
|
124
124
|
(?<year>\d{4})
|
125
|
-
)?
|
126
|
-
\b
|
125
|
+
)?
|
126
|
+
\b
|
127
127
|
)}ix
|
128
128
|
|
129
129
|
|
130
|
+
# e.g. 3 June or 10 June
|
131
|
+
DATE_II_RE = %r{
|
132
|
+
(?<date>
|
133
|
+
\b
|
134
|
+
## optional day name
|
135
|
+
((?<day_name>#{DAY_NAMES})
|
136
|
+
[ ]
|
137
|
+
)?
|
138
|
+
(?<day>\d{1,2})
|
139
|
+
[ ]
|
140
|
+
(?<month_name>#{MONTH_NAMES})
|
141
|
+
## optional year
|
142
|
+
( [ ]
|
143
|
+
(?<year>\d{4})
|
144
|
+
)?
|
145
|
+
\b
|
146
|
+
)}ix
|
147
|
+
|
148
|
+
|
149
|
+
#############################################
|
150
|
+
# map tables
|
151
|
+
# note: order matters; first come-first matched/served
|
152
|
+
DATE_RE = Regexp.union(
|
153
|
+
DATE_I_RE,
|
154
|
+
DATE_II_RE
|
155
|
+
)
|
156
|
+
|
157
|
+
|
158
|
+
##
|
159
|
+
## add a date parser helper
|
160
|
+
def self.parse_date( str, start: )
|
161
|
+
if m=DATE_RE.match( str )
|
162
|
+
|
163
|
+
year = m[:year].to_i(10) if m[:year]
|
164
|
+
month = MONTH_MAP[ m[:month_name].downcase ] if m[:month_name]
|
165
|
+
day = m[:day].to_i(10) if m[:day]
|
166
|
+
wday = DAY_MAP[ m[:day_name].downcase ] if m[:day_name]
|
167
|
+
|
168
|
+
if year.nil? ## try to calculate year
|
169
|
+
year = if month > start.month ||
|
170
|
+
(month == start.month && day >= start.day)
|
171
|
+
# assume same year as start_at event (e.g. 2013 for 2013/14 season)
|
172
|
+
start.year
|
173
|
+
else
|
174
|
+
# assume year+1 as start_at event (e.g. 2014 for 2013/14 season)
|
175
|
+
start.year+1
|
176
|
+
end
|
177
|
+
end
|
178
|
+
Date.new( year,month,day )
|
179
|
+
else
|
180
|
+
puts "!! ERROR - unexpected date format; cannot parse >#{str}<"
|
181
|
+
exit 1
|
182
|
+
end
|
183
|
+
end
|
184
|
+
|
185
|
+
|
186
|
+
|
130
187
|
###
|
131
|
-
# date duration
|
188
|
+
# date duration
|
132
189
|
# use - or + as separator
|
133
|
-
# in theory plus( +) only if dates
|
190
|
+
# in theory plus( +) only if dates
|
134
191
|
# are two days next to each other
|
135
192
|
#
|
136
193
|
# otherwise define new dates type in the future? why? why not?
|
@@ -147,7 +204,7 @@ DATE_RE = %r{
|
|
147
204
|
# Jun/25 .. 26 - why? why not???
|
148
205
|
# Jun/25 to 26 - why? why not???
|
149
206
|
# Jun/25 + 26 - add - why? why not???
|
150
|
-
# Sun-Wed Jun/23-26 - add - why? why not???
|
207
|
+
# Sun-Wed Jun/23-26 - add - why? why not???
|
151
208
|
# Wed+Thu Jun/26+27 2024 - add - why? why not???
|
152
209
|
#
|
153
210
|
# maybe use comman and plus for list of dates
|
@@ -157,39 +214,89 @@ DATE_RE = %r{
|
|
157
214
|
# add back optional comma (before) year - why? why not?
|
158
215
|
|
159
216
|
|
160
|
-
|
217
|
+
##
|
218
|
+
# todo add plus later on - why? why not?
|
219
|
+
|
220
|
+
DURATION_I_RE = %r{
|
161
221
|
(?<duration>
|
162
222
|
\b
|
163
223
|
## optional day name
|
164
224
|
((?<day_name1>#{DAY_NAMES})
|
165
225
|
[ ]
|
166
|
-
)?
|
226
|
+
)?
|
167
227
|
(?<month_name1>#{MONTH_NAMES})
|
168
228
|
(?: \/|[ ] )
|
169
229
|
(?<day1>\d{1,2})
|
170
230
|
## optional year
|
171
231
|
( [ ]
|
172
232
|
(?<year1>\d{4})
|
173
|
-
)?
|
233
|
+
)?
|
174
234
|
|
175
235
|
## support + and - (add .. or such - why??)
|
176
|
-
[ ]*[
|
177
|
-
|
236
|
+
[ ]*[-][ ]*
|
237
|
+
|
178
238
|
## optional day name
|
179
239
|
((?<day_name2>#{DAY_NAMES})
|
180
240
|
[ ]
|
181
|
-
)?
|
241
|
+
)?
|
182
242
|
(?<month_name2>#{MONTH_NAMES})
|
183
243
|
(?: \/|[ ] )
|
184
244
|
(?<day2>\d{1,2})
|
185
245
|
## optional year
|
186
246
|
( [ ]
|
187
247
|
(?<year2>\d{4})
|
188
|
-
)?
|
189
|
-
\b
|
248
|
+
)?
|
249
|
+
\b
|
250
|
+
)}ix
|
251
|
+
|
252
|
+
|
253
|
+
###
|
254
|
+
# variant ii
|
255
|
+
# e.g. 26 July - 27 July
|
256
|
+
|
257
|
+
DURATION_II_RE = %r{
|
258
|
+
(?<duration>
|
259
|
+
\b
|
260
|
+
## optional day name
|
261
|
+
((?<day_name1>#{DAY_NAMES})
|
262
|
+
[ ]
|
263
|
+
)?
|
264
|
+
(?<day1>\d{1,2})
|
265
|
+
[ ]
|
266
|
+
(?<month_name1>#{MONTH_NAMES})
|
267
|
+
## optional year
|
268
|
+
( [ ]
|
269
|
+
(?<year1>\d{4})
|
270
|
+
)?
|
271
|
+
|
272
|
+
## support + and - (add .. or such - why??)
|
273
|
+
[ ]*[-][ ]*
|
274
|
+
|
275
|
+
## optional day name
|
276
|
+
((?<day_name2>#{DAY_NAMES})
|
277
|
+
[ ]
|
278
|
+
)?
|
279
|
+
(?<day2>\d{1,2})
|
280
|
+
[ ]
|
281
|
+
(?<month_name2>#{MONTH_NAMES})
|
282
|
+
## optional year
|
283
|
+
( [ ]
|
284
|
+
(?<year2>\d{4})
|
285
|
+
)?
|
286
|
+
\b
|
190
287
|
)}ix
|
191
288
|
|
192
289
|
|
290
|
+
#############################################
|
291
|
+
# map tables
|
292
|
+
# note: order matters; first come-first matched/served
|
293
|
+
DURATION_RE = Regexp.union(
|
294
|
+
DURATION_I_RE,
|
295
|
+
DURATION_II_RE
|
296
|
+
)
|
297
|
+
|
298
|
+
|
299
|
+
|
193
300
|
end # class Parser
|
194
|
-
end # module SportDb
|
195
|
-
|
301
|
+
end # module SportDb
|
302
|
+
|
data/lib/sportdb/parser.rb
CHANGED
@@ -24,11 +24,6 @@ require_relative 'parser/lang'
|
|
24
24
|
require_relative 'parser/parser'
|
25
25
|
|
26
26
|
|
27
|
-
## more
|
28
|
-
require_relative 'parser/outline_reader'
|
29
|
-
require_relative 'parser/linter'
|
30
|
-
require_relative 'parser/opts'
|
31
|
-
|
32
27
|
|
33
28
|
###
|
34
29
|
# make parser api (easily) available - why? why not?
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sportdb-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-08-
|
11
|
+
date: 2024-08-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: cocos
|
@@ -74,8 +74,7 @@ dependencies:
|
|
74
74
|
version: '4.1'
|
75
75
|
description: sportdb-parser - football.txt match parser (& tokenizer)
|
76
76
|
email: gerald.bauer@gmail.com
|
77
|
-
executables:
|
78
|
-
- fbt
|
77
|
+
executables: []
|
79
78
|
extensions: []
|
80
79
|
extra_rdoc_files:
|
81
80
|
- CHANGELOG.md
|
@@ -86,12 +85,8 @@ files:
|
|
86
85
|
- Manifest.txt
|
87
86
|
- README.md
|
88
87
|
- Rakefile
|
89
|
-
- bin/fbt
|
90
88
|
- lib/sportdb/parser.rb
|
91
89
|
- lib/sportdb/parser/lang.rb
|
92
|
-
- lib/sportdb/parser/linter.rb
|
93
|
-
- lib/sportdb/parser/opts.rb
|
94
|
-
- lib/sportdb/parser/outline_reader.rb
|
95
90
|
- lib/sportdb/parser/parser.rb
|
96
91
|
- lib/sportdb/parser/token-date.rb
|
97
92
|
- lib/sportdb/parser/token-score.rb
|
@@ -112,7 +107,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
112
107
|
requirements:
|
113
108
|
- - ">="
|
114
109
|
- !ruby/object:Gem::Version
|
115
|
-
version:
|
110
|
+
version: 3.1.0
|
116
111
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
117
112
|
requirements:
|
118
113
|
- - ">="
|
data/bin/fbt
DELETED
@@ -1,94 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
## tip: to test run:
|
4
|
-
## ruby -I ./lib bin/fbt
|
5
|
-
|
6
|
-
## our own code
|
7
|
-
require 'sportdb/parser'
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
require 'optparse'
|
12
|
-
|
13
|
-
##
|
14
|
-
## read textfile
|
15
|
-
## and dump tokens
|
16
|
-
##
|
17
|
-
## fbt ../openfootball/.../euro.txt
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
args = ARGV
|
23
|
-
opts = { debug: false,
|
24
|
-
metal: false }
|
25
|
-
|
26
|
-
parser = OptionParser.new do |parser|
|
27
|
-
parser.banner = "Usage: #{$PROGRAM_NAME} [options]"
|
28
|
-
|
29
|
-
##
|
30
|
-
## check if git has a offline option?? (use same)
|
31
|
-
## check for other tools - why? why not?
|
32
|
-
|
33
|
-
|
34
|
-
parser.on( "--verbose", "--debug",
|
35
|
-
"turn on verbose / debug output (default: #{opts[:debug]})" ) do |debug|
|
36
|
-
opts[:debug] = debug
|
37
|
-
end
|
38
|
-
|
39
|
-
parser.on( "--metal",
|
40
|
-
"turn off typed parse tree; show to the metal tokens"+
|
41
|
-
" (default: #{opts[:metal]})" ) do |metal|
|
42
|
-
opts[:metal] = metal
|
43
|
-
end
|
44
|
-
end
|
45
|
-
parser.parse!( args )
|
46
|
-
|
47
|
-
puts "OPTS:"
|
48
|
-
p opts
|
49
|
-
puts "ARGV:"
|
50
|
-
p args
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
paths = if args.empty?
|
57
|
-
[
|
58
|
-
'../../../openfootball/euro/2020--europe/euro.txt',
|
59
|
-
'../../../openfootball/euro/2024--germany/euro.txt',
|
60
|
-
]
|
61
|
-
else
|
62
|
-
## check for directories
|
63
|
-
## and auto-expand
|
64
|
-
|
65
|
-
SportDb::Parser::Opts.expand_args( args )
|
66
|
-
end
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
SportDb::Parser::Linter.debug = true if opts[:debug]
|
71
|
-
|
72
|
-
linter = SportDb::Parser::Linter.new
|
73
|
-
|
74
|
-
errors = []
|
75
|
-
|
76
|
-
paths.each_with_index do |path,i|
|
77
|
-
puts "==> [#{i+1}/#{paths.size}] reading >#{path}<..."
|
78
|
-
linter.read( path, parse: !opts[:metal] )
|
79
|
-
|
80
|
-
errors += linter.errors if linter.errors?
|
81
|
-
end
|
82
|
-
|
83
|
-
if errors.size > 0
|
84
|
-
puts
|
85
|
-
pp errors
|
86
|
-
puts
|
87
|
-
puts "!! #{errors.size} parse error(s) in #{paths.size} datafiles(s)"
|
88
|
-
else
|
89
|
-
puts
|
90
|
-
puts "OK no parse errors found in #{paths.size} datafile(s)"
|
91
|
-
end
|
92
|
-
|
93
|
-
puts "bye"
|
94
|
-
|
data/lib/sportdb/parser/linter.rb
DELETED
@@ -1,149 +0,0 @@
|
|
1
|
-
|
2
|
-
module SportDb
|
3
|
-
class Parser
|
4
|
-
|
5
|
-
###
|
6
|
-
## note - Linter for now nested inside Parser - keep? why? why not?
|
7
|
-
class Linter
|
8
|
-
|
9
|
-
def self.debug=(value) @@debug = value; end
|
10
|
-
def self.debug?() @@debug ||= false; end ## note: default is FALSE
|
11
|
-
def debug?() self.class.debug?; end
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
attr_reader :errors
|
16
|
-
|
17
|
-
def initialize
|
18
|
-
@errors = []
|
19
|
-
@parser = Parser.new ## use own parser instance (not shared) - why? why not?
|
20
|
-
end
|
21
|
-
|
22
|
-
|
23
|
-
def errors?() @errors.size > 0; end
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
## note: colon (:) MUST be followed by one (or more) spaces
|
28
|
-
## make sure mon feb 12 18:10 will not match
|
29
|
-
## allow 1. FC Köln etc.
|
30
|
-
## Mainz 05:
|
31
|
-
## limit to 30 chars max
|
32
|
-
## only allow chars incl. intl buut (NOT ()[]/;)
|
33
|
-
##
|
34
|
-
## Group A:
|
35
|
-
## Group B: - remove colon
|
36
|
-
## or lookup first
|
37
|
-
|
38
|
-
ATTRIB_RE = %r{^
|
39
|
-
[ ]*? # slurp leading spaces
|
40
|
-
(?<key>[^:|\]\[()\/; -]
|
41
|
-
[^:|\]\[()\/;]{0,30}
|
42
|
-
)
|
43
|
-
[ ]*? # slurp trailing spaces
|
44
|
-
:[ ]+
|
45
|
-
(?<value>.+)
|
46
|
-
[ ]*? # slurp trailing spaces
|
47
|
-
$
|
48
|
-
}ix
|
49
|
-
|
50
|
-
|
51
|
-
#########
|
52
|
-
## parse - false (default) - tokenize (only)
|
53
|
-
## - true - tokenize & parse
|
54
|
-
def read( path, parse: false )
|
55
|
-
## note: every (new) read call - resets errors list to empty
|
56
|
-
@errors = []
|
57
|
-
|
58
|
-
nodes = OutlineReader.read( path )
|
59
|
-
|
60
|
-
## process nodes
|
61
|
-
h1 = nil
|
62
|
-
orphans = 0 ## track paragraphs's with no heading
|
63
|
-
|
64
|
-
attrib_found = false
|
65
|
-
|
66
|
-
|
67
|
-
nodes.each do |node|
|
68
|
-
type = node[0]
|
69
|
-
|
70
|
-
if type == :h1
|
71
|
-
h1 = node[1] ## get heading text
|
72
|
-
puts
|
73
|
-
puts " = Heading 1 >#{node[1]}<"
|
74
|
-
elsif type == :p
|
75
|
-
|
76
|
-
if h1.nil?
|
77
|
-
orphans += 1 ## only warn once
|
78
|
-
puts "!! WARN - no heading for #{orphans} text paragraph(s); skipping parse"
|
79
|
-
next
|
80
|
-
end
|
81
|
-
|
82
|
-
lines = node[1]
|
83
|
-
|
84
|
-
tree = []
|
85
|
-
lines.each_with_index do |line,i|
|
86
|
-
|
87
|
-
if debug?
|
88
|
-
puts
|
89
|
-
puts "line >#{line}<"
|
90
|
-
end
|
91
|
-
|
92
|
-
|
93
|
-
## skip new (experimental attrib syntax)
|
94
|
-
if attrib_found == false &&
|
95
|
-
ATTRIB_RE.match?( line )
|
96
|
-
## note: check attrib regex AFTER group def e.g.:
|
97
|
-
## Group A:
|
98
|
-
## Group B: etc.
|
99
|
-
## todo/fix - change Group A: to Group A etc.
|
100
|
-
## Group B: to Group B
|
101
|
-
attrib_found = true
|
102
|
-
## logger.debug "skipping key/value line - >#{line}<"
|
103
|
-
next
|
104
|
-
end
|
105
|
-
|
106
|
-
if attrib_found
|
107
|
-
## check if line ends with dot
|
108
|
-
## if not slurp up lines to the next do!!!
|
109
|
-
## logger.debug "skipping key/value line - >#{line}<"
|
110
|
-
attrib_found = false if line.end_with?( '.' )
|
111
|
-
# logger.debug "skipping key/value line (cont.) - >#{line}<"
|
112
|
-
next
|
113
|
-
end
|
114
|
-
|
115
|
-
t, error_messages = if parse
|
116
|
-
@parser.parse_with_errors( line )
|
117
|
-
else
|
118
|
-
@parser.tokenize_with_errors( line )
|
119
|
-
end
|
120
|
-
|
121
|
-
|
122
|
-
if error_messages.size > 0
|
123
|
-
## add to "global" error list
|
124
|
-
## make a triplet tuple (file / msg / line text)
|
125
|
-
error_messages.each do |msg|
|
126
|
-
@errors << [ path,
|
127
|
-
msg,
|
128
|
-
line
|
129
|
-
]
|
130
|
-
end
|
131
|
-
end
|
132
|
-
|
133
|
-
pp t if debug?
|
134
|
-
|
135
|
-
tree << t
|
136
|
-
end
|
137
|
-
|
138
|
-
## pp tree
|
139
|
-
else
|
140
|
-
pp node
|
141
|
-
raise ArgumentError, "unsupported (node) type >#{type}<"
|
142
|
-
end
|
143
|
-
end # each node
|
144
|
-
end # read
|
145
|
-
end # class Linter
|
146
|
-
|
147
|
-
|
148
|
-
end # class Parser
|
149
|
-
end # module SportDb
|
data/lib/sportdb/parser/opts.rb
DELETED
@@ -1,70 +0,0 @@
|
|
1
|
-
|
2
|
-
module SportDb
|
3
|
-
class Parser
|
4
|
-
|
5
|
-
###
|
6
|
-
## note - Opts Helpers for now nested inside Parser - keep here? why? why not?
|
7
|
-
class Opts
|
8
|
-
|
9
|
-
SEASON_RE = %r{ (?:
|
10
|
-
\d{4}-\d{2}
|
11
|
-
| \d{4}(--[a-z0-9_-]+)?
|
12
|
-
)
|
13
|
-
}x
|
14
|
-
SEASON = SEASON_RE.source ## "inline" helper for embedding in other regexes - keep? why? why not?
|
15
|
-
|
16
|
-
|
17
|
-
## note: if pattern includes directory add here
|
18
|
-
## (otherwise move to more "generic" datafile) - why? why not?
|
19
|
-
MATCH_RE = %r{ (?: ^|/ ) # beginning (^) or beginning of path (/)
|
20
|
-
#{SEASON}
|
21
|
-
/[a-z0-9_-]+\.txt$ ## txt e.g /1-premierleague.txt
|
22
|
-
}x
|
23
|
-
|
24
|
-
|
25
|
-
def self.find( path )
|
26
|
-
datafiles = []
|
27
|
-
|
28
|
-
## note: normalize path - use File.expand_path ??
|
29
|
-
## change all backslash to slash for now
|
30
|
-
## path = path.gsub( "\\", '/' )
|
31
|
-
path = File.expand_path( path )
|
32
|
-
|
33
|
-
## check all txt files
|
34
|
-
## note: incl. files starting with dot (.)) as candidates
|
35
|
-
## (normally excluded with just *)
|
36
|
-
candidates = Dir.glob( "#{path}/**/{*,.*}.txt" )
|
37
|
-
## pp candidates
|
38
|
-
candidates.each do |candidate|
|
39
|
-
datafiles << candidate if MATCH_RE.match( candidate )
|
40
|
-
end
|
41
|
-
|
42
|
-
## pp datafiles
|
43
|
-
datafiles
|
44
|
-
end
|
45
|
-
|
46
|
-
|
47
|
-
def self.expand_args( args )
|
48
|
-
paths = []
|
49
|
-
|
50
|
-
args.each do |arg|
|
51
|
-
## check if directory
|
52
|
-
if Dir.exist?( arg )
|
53
|
-
datafiles = find( arg )
|
54
|
-
puts
|
55
|
-
puts " found #{datafiles.size} match txt datafiles in #{arg}"
|
56
|
-
pp datafiles
|
57
|
-
paths += datafiles
|
58
|
-
else
|
59
|
-
## assume it's a file
|
60
|
-
paths << arg
|
61
|
-
end
|
62
|
-
end
|
63
|
-
|
64
|
-
paths
|
65
|
-
end
|
66
|
-
end # class Opts
|
67
|
-
|
68
|
-
|
69
|
-
end # class Parser
|
70
|
-
end # module SportDb
|
data/lib/sportdb/parser/outline_reader.rb
DELETED
@@ -1,97 +0,0 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
module SportDb
|
4
|
-
|
5
|
-
class OutlineReader
|
6
|
-
|
7
|
-
def self.debug=(value) @@debug = value; end
|
8
|
-
def self.debug?() @@debug ||= false; end
|
9
|
-
def debug?() self.class.debug?; end
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
|
14
|
-
txt = File.open( path, 'r:utf-8' ) {|f| f.read }
|
15
|
-
parse( txt )
|
16
|
-
end
|
17
|
-
|
18
|
-
def self.parse( txt )
|
19
|
-
new( txt ).parse
|
20
|
-
end
|
21
|
-
|
22
|
-
def initialize( txt )
|
23
|
-
@txt = txt
|
24
|
-
end
|
25
|
-
|
26
|
-
## note: skip "decorative" only heading e.g. ========
|
27
|
-
## todo/check: find a better name e.g. HEADING_EMPTY_RE or HEADING_LINE_RE or ???
|
28
|
-
HEADING_BLANK_RE = %r{\A
|
29
|
-
={1,}
|
30
|
-
\z}x
|
31
|
-
|
32
|
-
## note: like in wikimedia markup (and markdown) all optional trailing ==== too
|
33
|
-
HEADING_RE = %r{\A
|
34
|
-
(?<marker>={1,}) ## 1. leading ======
|
35
|
-
[ ]*
|
36
|
-
(?<text>[^=]+) ## 2. text (note: for now no "inline" = allowed)
|
37
|
-
[ ]*
|
38
|
-
=* ## 3. (optional) trailing ====
|
39
|
-
\z}x
|
40
|
-
|
41
|
-
def parse
|
42
|
-
outline=[] ## outline structure
|
43
|
-
start_para = true ## start new para(graph) on new text line?
|
44
|
-
|
45
|
-
@txt.each_line do |line|
|
46
|
-
line = line.strip ## todo/fix: keep leading and trailing spaces - why? why not?
|
47
|
-
|
48
|
-
if line.empty? ## todo/fix: keep blank line nodes?? and just remove comments and process headings?! - why? why not?
|
49
|
-
start_para = true
|
50
|
-
next
|
51
|
-
end
|
52
|
-
|
53
|
-
break if line == '__END__'
|
54
|
-
|
55
|
-
next if line.start_with?( '#' ) ## skip comments too
|
56
|
-
## strip inline (until end-of-line) comments too
|
57
|
-
## e.g Eupen | KAS Eupen ## [de]
|
58
|
-
## => Eupen | KAS Eupen
|
59
|
-
## e.g bq Bonaire, BOE # CONCACAF
|
60
|
-
## => bq Bonaire, BOE
|
61
|
-
line = line.sub( /#.*/, '' ).strip
|
62
|
-
pp line if debug?
|
63
|
-
|
64
|
-
## todo/check: also use heading blank as paragraph "breaker" or treat it like a comment ?? - why? why not?
|
65
|
-
next if HEADING_BLANK_RE.match( line ) # skip "decorative" only heading e.g. ========
|
66
|
-
|
67
|
-
## note: like in wikimedia markup (and markdown) all optional trailing ==== too
|
68
|
-
if m=HEADING_RE.match( line )
|
69
|
-
start_para = true
|
70
|
-
|
71
|
-
heading_marker = m[:marker]
|
72
|
-
heading_level = heading_marker.length ## count number of = for heading level
|
73
|
-
heading = m[:text].strip
|
74
|
-
|
75
|
-
puts "heading #{heading_level} >#{heading}<" if debug?
|
76
|
-
outline << [:"h#{heading_level}", heading]
|
77
|
-
else ## assume it's a (plain/regular) text line
|
78
|
-
if start_para
|
79
|
-
outline << [:p, [line]]
|
80
|
-
start_para = false
|
81
|
-
else
|
82
|
-
node = outline[-1] ## get last entry
|
83
|
-
if node[0] == :p ## assert it's a p(aragraph) node!!!
|
84
|
-
node[1] << line ## add line to p(aragraph)
|
85
|
-
else
|
86
|
-
puts "!! ERROR - invalid outline state / format - expected p(aragraph) node; got:"
|
87
|
-
pp node
|
88
|
-
exit 1
|
89
|
-
end
|
90
|
-
end
|
91
|
-
end
|
92
|
-
end
|
93
|
-
outline
|
94
|
-
end # method read
|
95
|
-
end # class OutlineReader
|
96
|
-
|
97
|
-
end # module SportDb
|