sportdb-formats 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Manifest.txt +11 -0
- data/Rakefile +1 -0
- data/config/fixtures/de.yml +46 -0
- data/config/fixtures/en.yml +54 -0
- data/config/fixtures/es.yml +48 -0
- data/config/fixtures/fr.yml +53 -0
- data/config/fixtures/it.yml +55 -0
- data/config/fixtures/pt.yml +46 -0
- data/config/fixtures/ro.yml +55 -0
- data/lib/sportdb/formats/datafile.rb +10 -4
- data/lib/sportdb/formats/date.rb +446 -0
- data/lib/sportdb/formats/lang.rb +216 -0
- data/lib/sportdb/formats/version.rb +6 -1
- data/lib/sportdb/formats.rb +17 -5
- data/test/test_datafile_match.rb +4 -4
- data/test/test_date.rb +100 -0
- data/test/test_lang.rb +130 -0
- metadata +27 -2
@@ -0,0 +1,446 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
#### fix: move to textutils for reuse !!!!!!!!!! - why?? why not ??
|
4
|
+
|
5
|
+
|
6
|
+
module SportDb
|
7
|
+
|
8
|
+
|
9
|
+
# Shared month-name lookup tables plus the conversion of a regex match
# (with named captures) into a DateTime.
#
# Note: the helpers below call `logger`, which this class does not define
# itself; the subclasses visible in this file mix in LogUtils::Logging.
class DateFinderBase

  # English month names / abbreviations => month number (as string)
  MONTH_EN_TO_MM = {
    'Jan' => '1', 'January' => '1',
    'Feb' => '2', 'February' => '2',
    'Mar' => '3', 'March' => '3',
    'Apr' => '4', 'April' => '4',
    'May' => '5',
    'Jun' => '6', 'June' => '6',
    'Jul' => '7', 'July' => '7',
    'Aug' => '8', 'August' => '8',
    'Sep' => '9', 'Sept' => '9', 'September' => '9',
    'Oct' => '10', 'October' => '10',
    'Nov' => '11', 'November' => '11',
    'Dec' => '12', 'December' =>'12' }.freeze

  # French month names / abbreviations => month number (as string)
  MONTH_FR_TO_MM = {
    'Janvier' => '1', 'Janv' => '1', 'Jan' => '1',     ## check janv in use??
    'Février' => '2', 'Févr' => '2', 'Fév' => '2',     ## check fevr in use???
    'Mars' => '3', 'Mar' => '3',
    'Avril' => '4', 'Avri' => '4', 'Avr' => '4',       ## check avri in use??? if not remove
    'Mai' => '5',
    'Juin' => '6',
    'Juillet' => '7', 'Juil' => '7',
    'Août' => '8',
    'Septembre' => '9', 'Sept' => '9',
    'Octobre' => '10', 'Octo' => '10', 'Oct' => '10',  ### check octo in use??
    'Novembre' => '11', 'Nove' => '11', 'Nov' => '11', ## check nove in use??
    'Décembre' => '12', 'Déce' => '12', 'Déc' => '12' }.freeze   ## check dece in use??

  # Spanish month names / abbreviations => month number (as string)
  MONTH_ES_TO_MM = {
    'Ene' => '1', 'Enero' => '1',
    'Feb' => '2',
    'Mar' => '3', 'Marzo' => '3',
    'Abr' => '4', 'Abril' => '4',
    'May' => '5', 'Mayo' => '5',
    'Jun' => '6', 'Junio' => '6',
    'Jul' => '7', 'Julio' => '7',
    'Ago' => '8', 'Agosto' => '8',
    'Sep' => '9', 'Set' => '9', 'Sept' => '9',
    'Oct' => '10',
    'Nov' => '11',
    'Dic' => '12' }.freeze

private

  # Infer the year for a date that came without one.
  #
  # month    - month number (Integer)
  # day      - day of month (Integer); only used for the debug message
  # start_at - object responding to #month and #year (required keyword)
  #
  # Returns start_at.year if month >= start_at.month, otherwise start_at.year+1.
  def calc_year( month, day, start_at: )   ## note: start_at required param for now on!!!

    logger.debug "   [calc_year] ????-#{month}-#{day} -- start_at: #{start_at}"

    if month >= start_at.month
      # assume same year as start_at event (e.g. 2013 for 2013/14 season)
      start_at.year
    else
      # assume year+1 as start_at event (e.g. 2014 for 2013/14 season)
      start_at.year+1
    end
  end

  # Build a DateTime from the named captures of a regex match.
  #
  # match_data - MatchData; the captures read here are :year, :month,
  #              :month_en, :month_es, :month_fr, :day, :hours and :minutes
  # opts       - options hash; opts[:start_at] is required whenever the match
  #              has no :year capture (it is handed to calc_year)
  #
  # Missing hours/minutes default to 12:00.
  #
  # Raises ArgumentError if the year must be inferred and opts has no :start_at.
  def parse_date_time( match_data, opts={} )

    # convert regex match_data captures to hash
    # - note: cannot use match_data like a hash (e.g. raises exception if key/name not present/found)
    # - note: do NOT forget to turn name into symbol for lookup in new hash (name.to_sym)
    h = {}
    match_data.names.each { |name| h[name.to_sym] = match_data[name] }

    logger.debug "   [parse_date_time] hash: >#{h.inspect}<"

    # a named month (en/es/fr) overrides a numeric :month capture
    h[ :month ] = MONTH_EN_TO_MM[ h[:month_en] ]  if h[:month_en]
    h[ :month ] = MONTH_ES_TO_MM[ h[:month_es] ]  if h[:month_es]
    h[ :month ] = MONTH_FR_TO_MM[ h[:month_fr] ]  if h[:month_fr]

    month = h[:month]
    day   = h[:day]

    year = h[:year]
    if year.nil?
      # pass start_at as an explicit keyword - handing over the opts hash
      #  positionally no longer gets converted to keywords in ruby 3+
      start_at = opts.fetch( :start_at ) { raise ArgumentError, 'missing keyword: :start_at' }
      year     = calc_year( month.to_i, day.to_i, start_at: start_at ).to_s
    end

    hours   = h[:hours]   || '12'   # default to 12:00 for HH:MM (hours:minutes)
    minutes = h[:minutes] || '00'

    value = '%d-%02d-%02d %02d:%02d' % [year.to_i, month.to_i, day.to_i, hours.to_i, minutes.to_i]
    logger.debug "   date: >#{value}<"

    DateTime.strptime( value, '%Y-%m-%d %H:%M' )
  end

end  # class DateFinderBase
|
98
|
+
|
99
|
+
|
100
|
+
# Finds the first date (with optional time) in a line of text by trying the
# patterns listed in FORMATS in order, and replaces the matched text in the
# line with the tag of the pattern that matched.
class DateFinder < DateFinderBase

  include LogUtils::Logging

  # todo: make more generic for reuse
  ### fix:
  ###    move to textutils
  ##      date/fr.yml en.yml etc. ???
  ##    why? why not?

  MONTH_FR = 'Janvier|Janv|Jan|' +
             'Février|Févr|Fév|' +
             'Mars|Mar|' +
             'Avril|Avri|Avr|' +
             'Mai|' +
             'Juin|' +
             'Juillet|Juil|' +
             'Août|' +
             'Septembre|Sept|' +
             'Octobre|Octo|Oct|' +
             'Novembre|Nove|Nov|' +
             'Décembre|Déce|Déc'

  # note: no trailing | after the last entry - a trailing | adds an empty
  #   alternative and would make the weekday optional in the FR__ pattern
  WEEKDAY_FR = 'Lundi|Lun|L|' +
               'Mardi|Mar|Ma|' +
               'Mercredi|Mer|Me|' +
               'Jeudi|Jeu|J|' +
               'Vendredi|Ven|V|' +
               'Samedi|Sam|S|' +
               'Dimanche|Dim|D'


  MONTH_EN = 'January|Jan|'+
             'February|Feb|'+
             'March|Mar|'+
             'April|Apr|'+
             'May|'+
             'June|Jun|'+
             'July|Jul|'+
             'August|Aug|'+
             'September|Sept|Sep|'+
             'October|Oct|'+
             'November|Nov|'+
             'December|Dec'

  ###
  ## todo: add days
  ## 1. Sunday - Sun. 2. Monday - Mon.
  ## 3. Tuesday - Tu., Tue., or Tues. 4. Wednesday - Wed.
  ## 5. Thursday - Th., Thu., Thur., or Thurs. 6. Friday - Fri.
  ## 7. Saturday - Sat.


  MONTH_ES = 'Enero|Ene|'+
             'Feb|'+
             'Marzo|Mar|'+
             'Abril|Abr|'+
             'Mayo|May|'+
             'Junio|Jun|'+
             'Julio|Jul|'+
             'Agosto|Ago|'+
             'Sept|Set|Sep|'+
             'Oct|'+
             'Nov|'+
             'Dic'

  # todo/fix - add de and es too!!
  # note: in Austria - Jänner - in Deutschland Januar allow both ??
  # MONTH_DE = 'J[aä]n|Feb|Mär|Apr|Mai|Jun|Jul|Aug|Sep|Okt|Nov|Dez'


  # e.g. 2012-09-14 20:30 => YYYY-MM-DD HH:MM
  # nb: allow 2012-9-3 7:30 e.g. no leading zero required
  # regex_db
  DB__DATE_TIME_REGEX = /\b
                         (?<year>\d{4})
                         -
                         (?<month>\d{1,2})
                         -
                         (?<day>\d{1,2})
                         \s+
                         (?<hours>\d{1,2})
                         :
                         (?<minutes>\d{2})
                         \b/x

  # e.g. 2012-09-14 w/ implied hours (set to 12:00)
  # nb: allow 2012-9-3 e.g. no leading zero required
  # regex_db2
  DB__DATE_REGEX = /\b
                    (?<year>\d{4})
                    -
                    (?<month>\d{1,2})
                    -
                    (?<day>\d{1,2})
                    \b/x

  # e.g. 14.09.2012 20:30 => DD.MM.YYYY HH:MM
  # nb: allow 2.3.2012 e.g. no leading zero required
  # nb: allow hour as 20.30
  # regex_de
  DD_MM_YYYY__DATE_TIME_REGEX = /\b
                                 (?<day>\d{1,2})
                                 \.
                                 (?<month>\d{1,2})
                                 \.
                                 (?<year>\d{4})
                                 \s+
                                 (?<hours>\d{1,2})
                                 [:.]
                                 (?<minutes>\d{2})
                                 \b/x

  # e.g. 14.09. 20:30 => DD.MM. HH:MM
  # nb: allow 2.3. e.g. no leading zero required
  # nb: allow hour as 20.30 or 3.30 instead of 03.30
  # regex_de2
  DD_MM__DATE_TIME_REGEX = /\b
                            (?<day>\d{1,2})
                            \.
                            (?<month>\d{1,2})
                            \.
                            \s+
                            (?<hours>\d{1,2})
                            [:.]
                            (?<minutes>\d{2})
                            \b/x

  # e.g. 14.09.2012 => DD.MM.YYYY w/ implied hours (set to 12:00)
  # regex_de3
  DD_MM_YYYY__DATE_REGEX = /\b
                            (?<day>\d{1,2})
                            \.
                            (?<month>\d{1,2})
                            \.
                            (?<year>\d{4})
                            \b/x

  # e.g. 14.09. => DD.MM. w/ implied year and implied hours (set to 12:00)
  # note: allow end delimiter ] e.g. [Sa 12.01.] or end-of-string ($) too
  # note: we use a lookahead for last part e.g. (?:\s+|$|[\]]) - do NOT consume
  # regex_de4 (use lookahead assert)
  DD_MM__DATE_REGEX = /\b
                       (?<day>\d{1,2})
                       \.
                       (?<month>\d{1,2})
                       \.
                       (?=\s+|$|[\]])/x    ## note: allow end-of-string/line too


  # e.g. 12 May 2013 14:00 => D|DD.MMM.YYYY H|HH:MM
  EN__DD_MONTH_YYYY__DATE_TIME_REGEX = /\b
                                        (?<day>\d{1,2})
                                        \s
                                        (?<month_en>#{MONTH_EN})
                                        \s
                                        (?<year>\d{4})
                                        \s+
                                        (?<hours>\d{1,2})
                                        :
                                        (?<minutes>\d{2})
                                        \b/x

  ###
  # fix: pass in lang (e.g. en or es)
  #   only process format for lang plus fallback to en?
  #   e.g. EN__DD_MONTH and ES__DD_MONTH depend on order for match (first listed will match)

  # e.g. 12 May => D|DD.MMM w/ implied year and implied hours
  EN__DD_MONTH__DATE_REGEX = /\b
                              (?<day>\d{1,2})
                              \s
                              (?<month_en>#{MONTH_EN})
                              \b/x


  # e.g. Jun/12 2011 14:00
  EN__MONTH_DD_YYYY__DATE_TIME_REGEX = /\b
                                        (?<month_en>#{MONTH_EN})
                                        \/
                                        (?<day>\d{1,2})
                                        \s
                                        (?<year>\d{4})
                                        \s+
                                        (?<hours>\d{1,2})
                                        :
                                        (?<minutes>\d{2})
                                        \b/x

  # e.g. Jun/12 14:00 w/ implied year H|HH:MM
  EN__MONTH_DD__DATE_TIME_REGEX = /\b
                                   (?<month_en>#{MONTH_EN})
                                   \/
                                   (?<day>\d{1,2})
                                   \s+
                                   (?<hours>\d{1,2})
                                   :
                                   (?<minutes>\d{2})
                                   \b/x

  # e.g. Jun/12 2013 w/ implied hours (set to 12:00)
  EN__MONTH_DD_YYYY__DATE_REGEX = /\b
                                   (?<month_en>#{MONTH_EN})
                                   \/
                                   (?<day>\d{1,2})
                                   \s
                                   (?<year>\d{4})
                                   \b/x

  # e.g. Jun/12 w/ implied year and implied hours (set to 12:00)
  # note: allow space too e.g Jun 12 -- check if conflicts w/ other formats??? (added for rsssf reader)
  #   -- fix: might eat french weekday mar 12 is mardi (mar) !!! see FR__ pattern
  #  fix: remove space again for now - and use simple en date reader or something!!!
  ##  was [\/ ] changed back to \/
  EN__MONTH_DD__DATE_REGEX = /\b
                              (?<month_en>#{MONTH_EN})
                              \/
                              (?<day>\d{1,2})
                              \b/x


  # e.g. 12 Ene w/ implied year and implied hours (set to 12:00)
  ES__DD_MONTH__DATE_REGEX = /\b
                              (?<day>\d{1,2})
                              \s
                              (?<month_es>#{MONTH_ES})
                              \b/x

  # e.g. Ven 8 Août or [Ven 8 Août] or Ven 8. Août or [Ven 8. Août]
  ### note: do NOT consume [] in regex (use lookahead assert)
  FR__WEEKDAY_DD_MONTH__DATE_REGEX = /\b
                                      (?:#{WEEKDAY_FR})   # note: skip weekday for now; do NOT capture
                                      \s+
                                      (?<day>\d{1,2})
                                      \.?        # note: make dot optional
                                      \s+
                                      (?<month_fr>#{MONTH_FR})
                                      (?=\s+|$|[\]])/x   ## note: allow end-of-string/line too



  # map table - 1) tag, 2) regex - note: order matters; first come-first matched/served
  FORMATS = [
    [ '[YYYY_MM_DD_hh_mm]',       DB__DATE_TIME_REGEX ],
    [ '[YYYY_MM_DD]',             DB__DATE_REGEX ],
    [ '[DD_MM_YYYY_hh_mm]',       DD_MM_YYYY__DATE_TIME_REGEX ],
    [ '[DD_MM_hh_mm]',            DD_MM__DATE_TIME_REGEX ],
    [ '[DD_MM_YYYY]',             DD_MM_YYYY__DATE_REGEX ],
    [ '[DD_MM]',                  DD_MM__DATE_REGEX ],
    [ '[FR_WEEKDAY_DD_MONTH]',    FR__WEEKDAY_DD_MONTH__DATE_REGEX ],
    [ '[EN_DD_MONTH_YYYY_hh_mm]', EN__DD_MONTH_YYYY__DATE_TIME_REGEX ],
    [ '[EN_MONTH_DD_YYYY_hh_mm]', EN__MONTH_DD_YYYY__DATE_TIME_REGEX ],
    [ '[EN_MONTH_DD_hh_mm]',      EN__MONTH_DD__DATE_TIME_REGEX ],
    [ '[EN_MONTH_DD_YYYY]',       EN__MONTH_DD_YYYY__DATE_REGEX ],
    [ '[EN_MONTH_DD]',            EN__MONTH_DD__DATE_REGEX ],
    [ '[EN_DD_MONTH]',            EN__DD_MONTH__DATE_REGEX ],
    [ '[ES_DD_MONTH]',            ES__DD_MONTH__DATE_REGEX ]
  ]



  def initialize
    # nothing here for now
  end

  # Find the first date in line and replace it (in place!) with a format tag.
  #
  # line - String; gets modified when a pattern matches
  # opts - options hash; forwarded unchanged to parse_date_time
  #
  # Returns the value of parse_date_time for the first pattern in FORMATS
  # that matches, or nil if no pattern matches (line is left untouched).
  def find!( line, opts={} )
    # fix: use more lookahead for all required trailing spaces!!!!!

    #
    # fix: !!!!
    # date in [] will become [[DATE.DE4]] - when getting removed will keep ]!!!!
    #  fix: change regex to \[[A-Z0-9.]\]  !!!!!!  plus add unit test too!!!
    #

    FORMATS.each do |tag, pattern|
      md = pattern.match( line )
      next unless md    # no match; continue; try next pattern

      # note: parse first - before the line (and the matched text) gets changed
      date = parse_date_time( md, opts )

      # replace exactly the matched span (by offsets); searching again for the
      #  matched text may hit an earlier look-alike that the regex skipped
      #  e.g. the 12.01. inside 112.01.
      line[ md.begin(0)...md.end(0) ] = tag
      return date
    end

    nil  # no match found
  end

end # class DateFinder
|
396
|
+
|
397
|
+
|
398
|
+
# Date finder for bracketed english month/day dates, e.g. [Jun 7] or [Aug 12].
class RsssfDateFinder < DateFinderBase

  include LogUtils::Logging

  # month names accepted inside the brackets (joined into a regex alternation)
  MONTH_EN = %w[
    Jan
    Feb
    March Mar
    April Apr
    May
    June Jun
    July Jul
    Aug
    Sept Sep
    Oct
    Nov
    Dec
  ].join( '|' )

  ## e.g.
  ##  [Jun 7] or [Aug 12] etc. - note: MUST include brackets e.g. []
  ##
  ##  check add \b at the beginning and end - why?? why not?? working??
  EN__MONTH_DD__DATE_REGEX = /\[
                              (?<month_en>#{MONTH_EN})
                              \s
                              (?<day>\d{1,2})
                              \]/x

  # Look for a bracketed month/day date in line; on a match the matched text
  # (brackets included) is replaced in place with the tag [EN_MONTH_DD].
  #
  # line - String; gets modified when the pattern matches
  # opts - options hash; forwarded unchanged to parse_date_time
  #
  # Returns the value of parse_date_time, or nil if nothing matched.
  def find!( line, opts={} )
    # fix: use more lookahead for all required trailing spaces!!!!!
    # fix: use <name capturing group> for month,day,year etc.!!!

    found = EN__MONTH_DD__DATE_REGEX.match( line )
    return nil if found.nil?    # no match found

    # note: parse before the line gets changed by sub!
    parsed = parse_date_time( found, opts )
    ## fix: use md.begin(0), md.end(0)
    line.sub!( found[0], '[EN_MONTH_DD]' )
    parsed
  end

end  ## class RsssfDateFinder
|
445
|
+
|
446
|
+
end # module SportDb
|
@@ -0,0 +1,216 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module SportDb
|
4
|
+
|
5
|
+
# Per-language word lists (loaded from yaml fixtures) plus regexes built from
# them. Lookups use the current lang and are cached until the lang changes or
# the words get reloaded.
class Lang

  include LogUtils::Logging

  attr_reader :lang

  def initialize
    # fix/todo: load on demand; only if no fixtures loaded/configured use builtin
    load_builtin_words
  end


  # Load the word lists shipped in the config folder (en, de, es, fr, it, pt, ro).
  def load_builtin_words
    builtin_words = {}
    %w[en de es fr it pt ro].each { |code| builtin_words[ code ] = "fixtures/#{code}" }

    load_words( builtin_words, SportDb::Formats.config_path )
  end


  # (Re)load word lists and train the classifier with them.
  #
  # h            - hash of lang key => fixture name (without the .yml extension)
  # include_path - folder the fixture names are relative to
  #
  # Resets the lang to 'en' and clears all cached values.
  def load_words( h, include_path )
    @lang  = 'en'   # make default lang english/en
    @words = {}     # resets fixtures
    @cache = {}     # reset cached values

    h.each.with_index( 1 ) do |(key, name), no|
      path = "#{include_path}/#{name}.yml"
      logger.debug( "loading words #{key} (#{no}/#{h.size}) in '#{name}' (#{path})..." )
      @words[ key ] = YAML.load( File.read_utf8( path ))
    end

    @classifier = TextUtils::Classifier.new
    @words.each.with_index( 1 ) do |(key, words), no|
      logger.debug "train classifier for #{key} (#{no}/#{@words.size})"
      @classifier.train( key, words )
    end

    @classifier.dump  # for debugging dump all words
  end

  def classify( text )
    @classifier.classify( text )
  end

  def classify_file( path )
    @classifier.classify_file( path )
  end

  # Switch the current lang; cached values get dropped if the lang changes.
  def lang=(value)
    logger.debug "setting lang to #{value}"

    changed = (@lang != value)
    if changed
      ### todo: make reset cached values into method/function for reuse (see load_words)
      logger.debug "reseting cached lang values (lang changed from #{@lang} to #{value})"
      @cache = {}    # reset cached values on language change
    end

    @lang = value
  end


  ## cached word lists (pipe-separated strings)
  def group()          cached( :group )          { group_getter };          end
  def round()          cached( :round )          { round_getter };          end
  def knockout_round() cached( :knockout_round ) { knockout_round_getter }; end
  def leg1()           cached( :leg1 )           { leg1_getter };           end
  def leg2()           cached( :leg2 )           { leg2_getter };           end

  ## cached regexes built from the word lists
  def regex_group()          cached( :regex_group )          { regex_group_getter };          end
  def regex_round()          cached( :regex_round )          { regex_round_getter };          end
  def regex_knockout_round() cached( :regex_knockout_round ) { regex_knockout_round_getter }; end
  def regex_leg1()           cached( :regex_leg1 )           { regex_leg1_getter };           end
  def regex_leg2()           cached( :regex_leg2 )           { regex_leg2_getter };           end

private

  # Return the cached value for key; run the block to fill the cache if the
  # slot is empty (nil/false).
  def cached( key )
    @cache[ key ] ||= yield
  end

  # Append the words for keys (in order, separated by |) to a new string.
  # Keys listed in optional get skipped if missing for the current lang.
  # NB: always construct a new string (do NOT use a reference to hash value)
  def words_for( keys, optional=[] )
    h      = @words[ lang ]
    values = ""
    keys.each_with_index do |key, idx|
      next if optional.include?( key ) && !h[key]   # nb: allow empty/is optional!!
      values << "|"  if idx > 0
      values << h[key]
    end
    values
  end

  def group_getter
    words_for( ['group'] )
  end

  def round_getter
    # e.g. Spieltag|Runde|Achtelfinale|Viertelfinale|Halbfinale|Finale

    ## fix/todo:
    ##  sort by length first - to allow best match e.g.
    ##    3rd place play-off  instead of Play-off ?? etc.  - why? why not?

    ### note: includes the knockout rounds values too
    words_for( ['round',
                'round32', 'round16', 'quarterfinals', 'semifinals',
                'fifthplace', 'thirdplace', 'final', 'playoffs'],
               ['fifthplace', 'playoffs'] )
  end

  def leg1_getter
    words_for( ['leg1'] )
  end

  def leg2_getter
    words_for( ['leg2'] )
  end

  def knockout_round_getter
    words_for( ['round32', 'round16', 'quarterfinals', 'semifinals',
                'fifthplace', 'thirdplace', 'final', 'playoffs'],
               ['fifthplace', 'playoffs'] )
  end

  ## todo: escape for regex?
  ## NB: let's ignore case (that is, UPCASE,downcase); always use /i flag
  def regex_group_getter
    /#{group}/i
  end

  ## todo: sort by length - biggest words go first? does regex match biggest word automatically?? - check
  ## todo/fix: make - optional e.g. convert to ( |-) or better [ \-] ??
  def regex_round_getter
    /#{round}/i
  end

  ## todo: sort by length - biggest words go first? does regex match biggest word automatically?? - check
  ## todo/fix: make - optional e.g. convert to ( |-) or better [ \-] ??
  def regex_knockout_round_getter
    /#{knockout_round}/i
  end

  def regex_leg1_getter
    /#{leg1}/i
  end

  def regex_leg2_getter
    /#{leg2}/i
  end

end # class Lang
|
214
|
+
|
215
|
+
|
216
|
+
end # module SportDb
|
@@ -6,7 +6,7 @@ module Formats
|
|
6
6
|
|
7
7
|
MAJOR = 0 ## todo: namespace inside version or something - why? why not??
|
8
8
|
MINOR = 1
|
9
|
-
PATCH =
|
9
|
+
PATCH = 2
|
10
10
|
VERSION = [MAJOR,MINOR,PATCH].join('.')
|
11
11
|
|
12
12
|
def self.version
|
@@ -21,5 +21,10 @@ module Formats
|
|
21
21
|
File.expand_path( File.dirname(File.dirname(File.dirname(File.dirname(__FILE__)))) )
|
22
22
|
end
|
23
23
|
|
24
|
+
|
25
|
+
# Path of the config folder - that is, root with /config appended.
def self.config_path
  [root, 'config'].join( '/' )
end
|
28
|
+
|
24
29
|
end # module Formats
|
25
30
|
end # module SportDb
|