sportdb-parser 0.6.20 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Manifest.txt +14 -8
- data/Rakefile +1 -1
- data/lib/sportdb/parser/blocktxt.rb +99 -0
- data/lib/sportdb/parser/lexer.rb +958 -395
- data/lib/sportdb/parser/lexer_buffer.rb +97 -0
- data/lib/sportdb/parser/lexer_tty.rb +111 -0
- data/lib/sportdb/parser/parser.rb +1768 -855
- data/lib/sportdb/parser/racc_parser.rb +1 -1
- data/lib/sportdb/parser/racc_tree.rb +327 -41
- data/lib/sportdb/parser/token-date.rb +160 -178
- data/lib/sportdb/parser/token-date_duration.rb +190 -0
- data/lib/sportdb/parser/token-geo.rb +59 -59
- data/lib/sportdb/parser/token-goals.rb +460 -0
- data/lib/sportdb/parser/token-group.rb +43 -0
- data/lib/sportdb/parser/token-note.rb +40 -0
- data/lib/sportdb/parser/token-prop.rb +70 -54
- data/lib/sportdb/parser/token-prop_name.rb +74 -0
- data/lib/sportdb/parser/token-round.rb +102 -0
- data/lib/sportdb/parser/token-score.rb +323 -47
- data/lib/sportdb/parser/token-score_fuller.rb +435 -0
- data/lib/sportdb/parser/token-score_legs.rb +59 -0
- data/lib/sportdb/parser/token-status.rb +157 -160
- data/lib/sportdb/parser/token-table.rb +149 -0
- data/lib/sportdb/parser/token-text.rb +72 -23
- data/lib/sportdb/parser/token-time.rb +141 -0
- data/lib/sportdb/parser/token.rb +242 -105
- data/lib/sportdb/parser/token_helpers.rb +92 -0
- data/lib/sportdb/parser/version.rb +2 -2
- data/lib/sportdb/parser.rb +24 -2
- metadata +18 -18
- data/config/rounds_de.txt +0 -125
- data/config/rounds_en.txt +0 -29
- data/config/rounds_es.txt +0 -26
- data/config/rounds_misc.txt +0 -25
- data/config/rounds_pt.txt +0 -4
- data/config/zones_en.txt +0 -20
- data/lib/sportdb/parser/lang.rb +0 -298
- data/lib/sportdb/parser/token-minute.rb +0 -205
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
|
|
2
|
+
module SportDb
|
|
3
|
+
|
|
4
|
+
## note - Tokens was placed inside Lexer - keep "top-level" for now inside SportDb
|
|
5
|
+
## for easier reuse with (new) lexer variants!!
|
|
6
|
+
|
|
7
|
+
## transforms
|
|
8
|
+
##
|
|
9
|
+
## Netherlands 1-2 (1-1) England
|
|
10
|
+
## => text => team
|
|
11
|
+
## score|vs
|
|
12
|
+
## text => team
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
## token iter/find better name
|
|
17
|
+
## e.g. TokenBuffer/Scanner or such ??
|
|
18
|
+
## Read-only cursor over an array of lexer tokens.
##
##   Every token is expected to be an array-like pair where
##     [0] is the token type (a symbol)  and
##     [1] is the token content/value.
##
##   The cursor position only advances via next (and collect);
##     all other methods look ahead without consuming.
class Tokens

  ## token types accepted by text()
  ##   note - the pattern examples in this file use upcase types (:TEXT);
  ##          downcase :text is kept for backward compatibility
  TEXT_TYPES = [:TEXT, :text].freeze


  ## tokens - array of [type, value] pairs (not copied)
  def initialize( tokens )
    @tokens = tokens
    @pos    = 0
  end

  ## current cursor position (zero-based index into tokens)
  attr_reader :pos

  ## true if the cursor is at (or past) the end of the tokens
  def eos?() @pos >= @tokens.size; end


  ## check if any token from the current position up to the end
  ##   has one of the passed-in types
  def include?( *types )
    ## puts "  starting include? #{types.inspect} @ #{@pos}"
    (@pos...@tokens.size).any? do |idx|
      types.include?( @tokens[idx][0] )
    end
  end

  ## check if the upcoming token types match the pattern
  ##   (starting at the current position, one pattern entry per token)
  ##
  ## pattern e.g. [:TEXT, [:VS,:SCORE], :TEXT]
  ##   - a single symbol must match exactly
  ##   - an array matches if it includes the token type
  def match?( *pattern )
    ## puts "  starting match? #{pattern.inspect} @ #{@pos}"
    pattern.each_with_index do |types,offset|
      ## if single symbol wrap in array
      types = [types]   unless types.is_a?( Array )
      return false      unless types.include?( peek( offset ) )
    end
    true
  end


  ## return token type (e.g. :TEXT, :NUM, etc.) at current position
  ##   or nil if end of tokens reached
  def cur() peek(0); end

  ## return content (assumed to be text) of token at pos+offset
  ##
  ##   raises ArgumentError if there is no token at pos+offset
  ##     or the token is not a text type
  def text( offset=0 )
    unless TEXT_TYPES.include?( peek( offset ) )
      raise ArgumentError, "text(#{offset}) - token not a text type"
    end
    @tokens[@pos+offset][1]
  end


  ## return token type at pos+offset (default is the NEXT token)
  ##   or nil if pos+offset is outside of the tokens
  def peek( offset=1 )
    idx = @pos + offset
    ## note - guard negative index too
    ##    otherwise array lookup wraps around to the end of the tokens
    return nil   if idx < 0 || idx >= @tokens.size

    @tokens[idx][0]
  end

  ## note - returns complete token (not only the type)
  ##   and advances the position by one;
  ##   returns nil if end of tokens reached (position advances nonetheless)
  def next
    # if @pos >= @tokens.size
    #     raise ArgumentError, "end of array - #{@pos} >= #{@tokens.size}"
    # end
    #   throw (standard) end of iteration here why? why not?

    token = @tokens[@pos]
    @pos += 1
    token
  end

  ## consume all remaining tokens and return them in an array;
  ##   if a block is passed in, every token gets mapped via the block
  def collect( &blk )
    collected = []
    loop do
      break if eos?
      token = self.next
      collected << (blk ? blk.call( token ) : token)
    end
    collected
  end
end  # class Tokens
|
|
97
|
+
end # module SportDb
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
module SportDb
|
|
2
|
+
class Lexer


  ########
  ## experimental teletype (tty) mode
  ##   only space, A-Z and 0-9 allowed

  ## matches a complete (non-blank) line
  ##   made up of A-Z, 0-9 and spaces only
  IS_TTY_LINE_RE = %r{ \A
                        ## note - use NEGATIVE lookahead to exclude blank lines
                        (?! [ ]*\z)

                        [A-Z0-9 ]+
                       \z
                     }x


  ## run of two or more spaces  -or-  a single space
  TTY_SPACES_RE = %r{ (?<spaces> [ ]{2,}) |
                      (?<space>  [ ])
                    }x

  ## run of digits delimited by word boundaries
  TTY_NUM_RE    = %r{ \b (?<num> \d+ ) \b
                    }x

  ##
  ##  note - TEXT for now allows  A, 1A, A1, A1A,  A1 B1 C1,
  ##                                 A1AA1 2B22 3C33
  ##           - single space only for concat
  ##    text segments MUST NOT be all numbers e.g. 1, 11, etc.
  TTY_TEXT_RE = %r{ \b (?<text>
                          (?:
                              [A-Z]           ## MUST start with letter
                                |
                              [0-9]+[A-Z]     ## or numbers followed by letter
                           )
                           [0-9A-Z]*
                           (?:
                              ### allow more segments separated
                              ##    by single space
                              [ ]
                              (?:
                                 [A-Z]           ## MUST start with letter
                                  |
                                 [0-9]+[A-Z]     ## or numbers followed by letter
                               )
                              [0-9A-Z]*
                           )*
                        )
                      \b
                  }x


  ## note - order matters; alternatives get tried left-to-right
  TTY_RE = Regexp.union(
                  TTY_SPACES_RE,
                  TTY_TEXT_RE,
                  TTY_NUM_RE,
                  ## fix add ANY_RE,
               )


  ## split a teletype-style line into tokens
  ##
  ##   returns an array of tokens, that is,
  ##      [:TTY_TEXT, <string>]   or
  ##      [:TTY_NUM,  <integer>]
  ##   note - spaces get skipped (no tokens)
  ##
  ##   anything NOT matched by TTY_RE gets skipped too and
  ##     reported as a warning (printed and passed on to log).
  ##   note - log is NOT defined in this file
  ##           (presumably part of Lexer elsewhere - to be confirmed)
  def _tokenize_tty_line( line )
    line = line.strip

    tokens = []

    ## where the next match is expected to start
    ##    (that is, the end of the last match)
    pos = 0

    while (m = TTY_RE.match( line, pos ))
      match_start = m.begin(0)

      ## match NOT starting at start/begin position!!!
      ##   report parse error!!!
      _warn_tty_skipped( line, pos, match_start )   if match_start != pos

      pos = m.end(0)

      token = if m[:spaces] || m[:space]
                nil   ## skip spaces
              elsif m[:text]
                [:TTY_TEXT, m[:text]]
              elsif m[:num]
                [:TTY_NUM, m[:num].to_i(10)]
              else
                ## report error/raise exception
                puts "!!! TTY TOKENIZE ERROR - no match found"
                nil
              end

      tokens << token   if token
    end

    ## check if no match in end of string
    _warn_tty_skipped( line, pos, line.size )   if pos != line.size

    tokens
  end

  ## helper - report skipped (unmatched) text in line
  ##   from (inclusive) and to (exclusive) are the offsets of the skipped text
  def _warn_tty_skipped( line, from, to )
    msg =  "!! WARN - tokenize (tty) error - skipping >#{line[from...to]}< @#{from},#{to} in line >#{line}<"
    puts msg
    log( msg )
  end

end  # class Lexer
|
|
110
|
+
end # module SportDb
|
|
111
|
+
|