rsssf 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/{HISTORY.md → CHANGELOG.md} +2 -0
- data/Manifest.txt +3 -6
- data/README.md +43 -26
- data/Rakefile +8 -7
- data/lib/rsssf/convert.rb +495 -0
- data/lib/rsssf/download.rb +151 -0
- data/lib/rsssf/page.rb +70 -45
- data/lib/rsssf/repo.rb +77 -153
- data/lib/rsssf/reports/page.rb +30 -19
- data/lib/rsssf/reports/schedule.rb +111 -25
- data/lib/rsssf/schedule.rb +4 -14
- data/lib/rsssf/utils.rb +10 -29
- data/lib/rsssf/version.rb +3 -5
- data/lib/rsssf.rb +42 -19
- metadata +26 -25
- data/.gemtest +0 -0
- data/lib/rsssf/fetch.rb +0 -80
- data/lib/rsssf/html2txt.rb +0 -157
- data/lib/rsssf/patch.rb +0 -28
- data/test/helper.rb +0 -12
- data/test/test_utils.rb +0 -83
data/lib/rsssf/html2txt.rb
DELETED
@@ -1,157 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Rsssf
module Filters

  ## Email address enclosed in parentheses, e.g.
  ##   (selamm@example.es)   (buuu@mscs.dal.ca)   (Manu_Maya@yakoo.co)
  ## The address may additionally be wrapped in ‹› - that is what html_to_txt
  ## produces for a href (mailto) links, e.g. (‹selamm@example.es›).
  EMAIL_RX = /\(‹?[a-z][a-z0-9_]+@[a-z]+(?:\.[a-z]+)+›?\)/i

  ## Same as EMAIL_RX but only when the email fills a whole line of its own.
  ## note: the leading newline is a lookbehind (NOT consumed), so that
  ##   several free-standing emails on consecutive lines all get matched.
  FREE_STANDING_EMAIL_RX = /(?<=\n)#{EMAIL_RX.source}\n/i


  ## Converts a (simple, upper/lower-case tag) html page into plain text.
  ##
  ## Steps (the order matters - see the notes inline):
  ##   - keep only what is between <BODY> and </BODY>
  ##   - unwrap cite / i, turn b into **..**
  ##   - turn hr, br, p into newlines / a text rule
  ##   - unwrap anchors (a name => text, a href => ‹text›)
  ##   - turn h2 / h4 into "## .." / "#### .." headings
  ##   - drop pre tags
  ##   - expand tabs, strip trailing whitespace per line
  ##   - remove emails (see sanitize)
  ##
  ## Prints a debug line to stdout for every replacement made.
  ##
  ## html - the page source (String)
  ##
  ## Returns the converted text (String); every line ends with a newline.
  ##
  ## todo: check if any tags are (still) present??
  def html_to_txt( html )

    ## cut off everything before body
    html = html.sub( /.+?<BODY>\s*/im, '' )

    ## cut off everything after body (closing)
    html = html.sub( /<\/BODY>.*/im, '' )


    ## remove cite
    html = html.gsub( /<CITE>([^<]+)<\/CITE>/im ) do |_|
      text = Regexp.last_match( 1 )
      puts " remove cite >#{text}<"
      text
    end

    ## replace horizontal rule (hr)
    html = html.gsub( /\s*<HR>\s*/im ) do |match|
      ## note: newlines get printed as $$ to keep the debug output on one line
      puts " replace horizontal rule (hr) - >#{match.gsub( "\n", '$$' )}<"
      "\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n"   ## todo: check what hr to use - . - . - or =-=-=-= or something distinct?
    end

    ## replace break (br)
    ##  note: does NOT use m/multiline for now - why? why not??
    html = html.gsub( /<BR>\s*/i ) do |match|   ## note: include (swallow) "extra" newline
      puts " replace break (br) - >#{match.gsub( "\n", '$$' )}<"
      "\n"
    end

    ## remove anchors (a name)
    html = html.gsub( /<A NAME[^>]*>(.+?)<\/A>/im ) do |match|   ## note: use .+? non-greedy match
      ## note: "save" the capture first; the gsub below may replace the last match
      title = Regexp.last_match( 1 ).to_s
      puts " replace anchor (a) name >#{title}< - >#{match.gsub( "\n", '$$' )}<"
      title
    end

    ## remove anchors (a href)
    ##  note: heading 4 includes anchor (thus, let anchors go first)
    ##  note: <a \newline href is used for authors email - thus incl. support for newline as space
    html = html.gsub( /<A\s+HREF[^>]*>(.+?)<\/A>/im ) do |_|   ## note: use .+? non-greedy match
      text = Regexp.last_match( 1 )
      puts " replace anchor (a) href >#{text}<"
      "‹#{text}›"
    end

    ## replace paragraph (p)
    html = html.gsub( /\s*<P>\s*/im ) do |match|   ## note: include (swallow) "extra" newline
      puts " replace paragraph (p) - >#{match.gsub( "\n", '$$' )}<"
      "\n\n"
    end
    html = html.gsub( /<\/P>/i, '' )   ## replace paragraph (p) closing w/ nothing for now

    ## remove i
    html = html.gsub( /<I>([^<]+)<\/I>/im ) do |_|
      text = Regexp.last_match( 1 )
      puts " remove italic (i) >#{text}<"
      text
    end


    ## heading 2
    html = html.gsub( /\s*<H2>([^<]+)<\/H2>\s*/im ) do |_|
      text = Regexp.last_match( 1 )
      puts " replace heading 2 (h2) >#{text}<"
      "\n\n## #{text}\n\n"    ## note: make sure to always add two newlines
    end

    ## heading 4
    html = html.gsub( /\s*<H4>([^<]+)<\/H4>\s*/im ) do |_|
      text = Regexp.last_match( 1 )
      puts " replace heading 4 (h4) >#{text}<"
      "\n\n#### #{text}\n\n"    ## note: make sure to always add two newlines
    end


    ## replace b with **..** - note: might include anchors (thus, call after anchors)
    html = html.gsub( /<B>([^<]+)<\/B>/im ) do |_|
      text = Regexp.last_match( 1 )
      puts " remove bold (b) >#{text}<"
      "**#{text}**"
    end

    ## replace preformatted (pre)
    html = html.gsub( /<PRE>|<\/PRE>/i ) do |_|
      puts " replace preformatted (pre)"
      ''   # replace w/ nothing for now (keep surrounding newlines)
    end

    ## (debug) dump the start and the end of the converted page
    # puts
    # puts
    # puts "html:"
    # puts html[0..2000]
    # puts "-- snip --"
    # puts html[-1000..-1]   ## print last thousand chars


    ## cleanup whitespaces
    ##  todo/fix: convert newline in space first
    ##    and then collapse spaces etc.!!!
    cleaned = html.each_line.map do |line|
      ## replace all tabs w/ two spaces for now and
      ##  remove trailing whitespace (incl. newline/formfeed)
      line.gsub( "\t", '  ' ).rstrip + "\n"
    end
    txt = cleaned.join

    ### remove emails etc.
    sanitize( txt )
  end # method html_to_txt



  ## Removes emails in parentheses for (spam/privacy) protection.
  ##
  ## Free-standing emails (on a line of their own) get removed together
  ## with their line; all other emails get cut out in place.
  ## Prints a debug line to stdout for every email removed.
  ##
  ## txt - the text to clean (String)
  ##
  ## Returns the text without the emails (String).
  def sanitize( txt )
    ## pass 1: free-standing emails - drop the whole line
    txt = txt.gsub( FREE_STANDING_EMAIL_RX ) do |match|
      ## note: the newline before the email is not part of the match
      ##   (lookbehind); add it back for the debug output only
      puts "removing (free-standing) email >\n#{match}<"
      ''
    end

    ## pass 2: all remaining (inline) emails
    txt.gsub( EMAIL_RX ) do |match|
      puts "remove email >#{match}<"
      ''
    end
  end # method sanitize

end # module Filters
end # module Rsssf
|
153
|
-
|
154
|
-
## add (shortcut) alias
|
155
|
-
RsssfFilters = Rsssf::Filters
|
156
|
-
|
157
|
-
|
data/lib/rsssf/patch.rb
DELETED
@@ -1,28 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Rsssf

class Patcher

  ## Season pattern (regex source as a string) for use inside heading regexes
  ##   e.g. 2008/09
  ##   note: also supports 1999/2000
  ## note: uses single quotes - backslashes do NOT get processed (e.g. '\d' => "\\d")
  SEASON = '\d{4}\/(\d{2}|\d{4})'.freeze

  ## Replaces the first match of every given regex with a level-4 heading.
  ## Prints a debug line to stdout for every heading found.
  ##
  ## txt   - the text to patch (String)
  ## rxs   - the heading patterns; a list of regexes or a single regex
  ## title - the heading text to use (String); inserted as is
  ##
  ## Returns the patched text (String); unchanged if nothing matched.
  def patch_heading( txt, rxs, title )
    ## note: Array() lets callers pass a single regex (or nil) too
    Array( rxs ).reduce( txt ) do |patched, rx|
      patched.sub( rx ) do |match|
        ## change newlines to $$ for single-line outputs/dumps
        puts " found heading >#{match.gsub( "\n", '$$' )}<"
        ## note: block form - backslashes in title are NOT treated as backreferences
        "\n\n#### #{title}\n\n"
      end
    end
  end

end # class Patcher
end ## module Rsssf
|
25
|
-
|
26
|
-
## add (shortcut) alias
|
27
|
-
RsssfPatcher = Rsssf::Patcher
|
28
|
-
|
data/test/helper.rb
DELETED
data/test/test_utils.rb
DELETED
@@ -1,83 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
require 'helper'
|
4
|
-
|
5
|
-
## Tests for the year / season / archive dir helpers mixed in from RsssfUtils.
## note: uses Minitest (NOT the legacy MiniTest constant, which current
##   minitest versions no longer define by default)
class TestUtils < Minitest::Test

  include RsssfUtils   ## e.g. year_from_name etc.

  ###########
  ## year_from_name
  ##  note: num <= 16 - assume 20xx for now from 00..16
  ##          - else 19xx
  def test_year_from_name
    assert_equal 2000, year_from_name( 'duit00' )
    assert_equal 2016, year_from_name( 'duit16' )

    assert_equal 1999, year_from_name( 'duit99' )

    assert_equal 2001, year_from_name( 'duit2001' )

    assert_equal 1964, year_from_name( 'duit64' )
    assert_equal 1965, year_from_name( 'duit1965' )
    assert_equal 2011, year_from_name( 'duit2011' )
  end

  ####
  ## year_from_file
  ##  note: only the file name counts - numbers in the path get ignored
  def test_year_from_file
    assert_equal 2000, year_from_file( 'duit00.txt' )
    assert_equal 2000, year_from_file( 'duit00.html' )
    assert_equal 2000, year_from_file( './duit00.txt' )
    assert_equal 2000, year_from_file( 'xxx/1998/xxx/duit00.txt' )

    assert_equal 2016, year_from_file( 'duit16.txt' )
    assert_equal 2016, year_from_file( 'duit16.html' )

    assert_equal 2001, year_from_file( 'duit2001.txt' )
    assert_equal 2001, year_from_file( 'duit2001.html' )
    assert_equal 2001, year_from_file( 'xx/1990s/1997/xxx/duit2001.txt' )

    assert_equal 2000, year_from_file( 'de-deutschland/tables/duit00.txt' )
    assert_equal 1964, year_from_file( 'de-deutschland/62/tables/duit64.txt' )     # check w/ numbers in path
    assert_equal 1999, year_from_file( 'de-deutschland/1977/tables/duit99.txt' )   # check w/ numbers in path
    assert_equal 1965, year_from_file( 'de-deutschland/tables/duit1965.txt' )
    assert_equal 2011, year_from_file( 'de-deutschland/tables/duit2011.txt' )

    assert_equal 2000, year_from_file( 'de-deutschland/tables/duit00.html' )
    assert_equal 1964, year_from_file( 'de-deutschland/62/tables/duit64.html' )     # check w/ numbers in path
    assert_equal 1999, year_from_file( 'de-deutschland/1977/tables/duit99.html' )   # check w/ numbers in path
    assert_equal 1965, year_from_file( 'de-deutschland/tables/duit1965.html' )
    assert_equal 2011, year_from_file( 'de-deutschland/tables/duit2011.html' )
  end

  #####
  ## year_to_season
  def test_year_to_season
    assert_equal '1998-99', year_to_season( 1999 )
    assert_equal '1999-00', year_to_season( 2000 )   ## todo: use 1999-2000 - why? why not??
    assert_equal '2000-01', year_to_season( 2001 )
    assert_equal '2014-15', year_to_season( 2015 )

    ## two-digit (short) years
    assert_equal '1999-00', year_to_season( 0 )
    assert_equal '1963-64', year_to_season( 64 )
    assert_equal '1998-99', year_to_season( 99 )
    assert_equal '1964-65', year_to_season( 1965 )
    assert_equal '2010-11', year_to_season( 2011 )
  end

  #######
  ## archive_dir_for_year
  ##  note: year <= 2010 use season 2009-10
  ##  NOTE(review): the cases below show older years mapping to
  ##    archive/<decade>s/<season> and 2015 mapping to the bare season -
  ##    confirm the exact cut-off year against archive_dir_for_year
  def test_archive_dir_for_year
    assert_equal 'archive/1990s/1998-99', archive_dir_for_year( 1999 )
    assert_equal 'archive/2000s/2000-01', archive_dir_for_year( 2001 )
    assert_equal '2014-15', archive_dir_for_year( 2015 )
  end

end # class TestUtils
|
83
|
-
|