rsssf 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/{HISTORY.md → CHANGELOG.md} +2 -0
- data/Manifest.txt +3 -6
- data/README.md +43 -26
- data/Rakefile +8 -7
- data/lib/rsssf/convert.rb +495 -0
- data/lib/rsssf/download.rb +151 -0
- data/lib/rsssf/page.rb +70 -45
- data/lib/rsssf/repo.rb +77 -153
- data/lib/rsssf/reports/page.rb +30 -19
- data/lib/rsssf/reports/schedule.rb +111 -25
- data/lib/rsssf/schedule.rb +4 -14
- data/lib/rsssf/utils.rb +10 -29
- data/lib/rsssf/version.rb +3 -5
- data/lib/rsssf.rb +42 -19
- metadata +26 -25
- data/.gemtest +0 -0
- data/lib/rsssf/fetch.rb +0 -80
- data/lib/rsssf/html2txt.rb +0 -157
- data/lib/rsssf/patch.rb +0 -28
- data/test/helper.rb +0 -12
- data/test/test_utils.rb +0 -83
data/lib/rsssf/html2txt.rb
DELETED
@@ -1,157 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Rsssf
  # Text filters for turning (old-style, upper/lower-case tag) HTML pages
  # into plain text with light markdown-like markup.
  #
  # Meant to be mixed in (`include Rsssf::Filters`); both methods are pure
  # string transformations apart from the progress lines they print to stdout.
  module Filters
    # Matches a parenthesised email address, optionally wrapped in ‹› (the
    # enclosure html_to_txt puts around converted <a href> links), e.g.
    #   (selamm@example.es)   (Manu_Maya@yakoo.co)   (‹selamm@example.es›)
    # Local part may contain digits, dots, underscores, plus and hyphens;
    # domain labels may contain digits and hyphens; the last label is letters only.
    EMAIL_RX = /\(‹?[a-z0-9][a-z0-9._+-]*@[a-z0-9-]+(?:\.[a-z0-9-]+)*\.[a-z]+›?\)/i.freeze

    # Same address when it is the only thing on its line. The trailing newline
    # is a lookahead (not consumed) so that several consecutive email-only
    # lines are each recognised.
    FREE_STANDING_EMAIL_RX = /\n#{EMAIL_RX.source}(?=\n)/i.freeze

    # Converts an HTML page to text.
    #
    # Steps (order matters - anchors go before headings/bold because those
    # may contain anchors):
    #   1. drop everything up to and including the opening body tag and
    #      everything from the closing body tag on
    #   2. unwrap <cite>, <i>; turn <b> into **bold**
    #   3. replace <hr> with a rule line, <br> with a newline, <p> with a blank line
    #   4. unwrap <a name>, wrap <a href> text in ‹›
    #   5. turn <h2>/<h4> into "## "/"#### " headings
    #   6. drop <pre> tags, replace tabs with two spaces, strip trailing whitespace
    #   7. remove email addresses (see #sanitize)
    #
    # Prints one progress line per replacement to stdout.
    #
    # @param html [String] the page source
    # @return [String] the converted text; every line ends with "\n"
    #
    # TODO: check if any tags are (still) present after conversion.
    def html_to_txt(html)
      ## cut off everything before body
      ##  note: \A.*? (instead of .+?) also handles pages that start with
      ##        the body tag; [^>]* allows attributes, e.g. <BODY BGCOLOR=...>
      html = html.sub(/\A.*?<BODY\b[^>]*>\s*/im, '')

      ## cut off everything after body (closing)
      html = html.sub(/<\/BODY>.*/im, '')

      ## remove cite
      html = html.gsub(/<CITE>([^<]+)<\/CITE>/im) do
        text = Regexp.last_match(1)
        puts " remove cite >#{text}<"
        text
      end

      html = html.gsub(/\s*<HR>\s*/im) do |match|
        ## make newlines visible for debugging
        puts " replace horizontal rule (hr) - >#{match.gsub("\n", '$$')}<"
        "\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n"
      end

      ## replace break (br) - include (swallow) "extra" newline
      html = html.gsub(/<BR>\s*/i) do |match|
        puts " replace break (br) - >#{match.gsub("\n", '$$')}<"
        "\n"
      end

      ## remove anchors (a name)
      ##  note: \s+ (like the href rule below) so a newline after <A works too
      html = html.gsub(/<A\s+NAME[^>]*>(.+?)<\/A>/im) do |match|
        ## note: save the capture first; the gsub below resets the last match
        title = Regexp.last_match(1).to_s
        puts " replace anchor (a) name >#{title}< - >#{match.gsub("\n", '$$')}<"
        title
      end

      ## remove anchors (a href)
      ##  note: heading 4 includes anchor (thus, let anchors go first)
      ##  note: <a \newline href is used for authors email - thus \s+
      html = html.gsub(/<A\s+HREF[^>]*>(.+?)<\/A>/im) do
        text = Regexp.last_match(1)
        puts " replace anchor (a) href >#{text}<"
        "‹#{text}›"
      end

      ## replace paragraph (p) - include (swallow) "extra" newline
      html = html.gsub(/\s*<P>\s*/im) do |match|
        puts " replace paragraph (p) - >#{match.gsub("\n", '$$')}<"
        "\n\n"
      end
      html = html.gsub(/<\/P>/i, '') ## closing paragraph gets replaced w/ nothing for now

      ## remove i
      html = html.gsub(/<I>([^<]+)<\/I>/im) do
        text = Regexp.last_match(1)
        puts " remove italic (i) >#{text}<"
        text
      end

      ## heading 2 - note: make sure to always add two newlines
      html = html.gsub(/\s*<H2>([^<]+)<\/H2>\s*/im) do
        text = Regexp.last_match(1)
        puts " replace heading 2 (h2) >#{text}<"
        "\n\n## #{text}\n\n"
      end

      ## heading 4 - note: make sure to always add two newlines
      html = html.gsub(/\s*<H4>([^<]+)<\/H4>\s*/im) do
        text = Regexp.last_match(1)
        puts " replace heading 4 (h4) >#{text}<"
        "\n\n#### #{text}\n\n"
      end

      ## bold (b) - note: might include anchors (thus, call after anchors)
      html = html.gsub(/<B>([^<]+)<\/B>/im) do
        text = Regexp.last_match(1)
        puts " remove bold (b) >#{text}<"
        "**#{text}**"
      end

      ## replace preformatted (pre) w/ nothing for now (keep surrounding newlines)
      html = html.gsub(/<PRE>|<\/PRE>/i) do
        puts " replace preformatted (pre)"
        ''
      end

      ## cleanup whitespace: tabs become two spaces, trailing whitespace
      ##  (incl. newline/formfeed) is stripped and every line gets a "\n"
      ##  note: built w/ map/join (no in-place << on a string literal), so this
      ##        also works w/ frozen string literals
      ## TODO: convert newlines to spaces first and then collapse spaces
      txt = html.each_line.map { |line| "#{line.gsub("\t", '  ').rstrip}\n" }.join

      ### remove emails etc.
      sanitize(txt)
    end

    # Removes parenthesised email addresses for (spam/privacy) protection.
    #
    # An address that stands alone on its own line is removed together with
    # its line; any other parenthesised address is cut out in place.
    # Prints one progress line per removal to stdout.
    #
    # @param txt [String]
    # @return [String] txt without the matched addresses
    def sanitize(txt)
      txt = txt.gsub(FREE_STANDING_EMAIL_RX) do |match|
        puts "removing (free-standing) email >#{match}<"
        '' # drop the line; the following newline was not consumed
      end

      txt.gsub(EMAIL_RX) do |match|
        puts "remove email >#{match}<"
        ''
      end
    end
  end # module Filters
end # module Rsssf

## add (shortcut) alias
RsssfFilters = Rsssf::Filters
|
156
|
-
|
157
|
-
|
data/lib/rsssf/patch.rb
DELETED
@@ -1,28 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Rsssf
  # Helpers for patching up converted text pages.
  class Patcher
    ## Regexp source (a string, not a Regexp) for a season, e.g. 2008/09
    ##  note: also supports 1999/2000
    ##  note: single quotes - backslashes do NOT get processed (e.g. '\d' => "\\d")
    SEASON = '\d{4}\/(\d{2}|\d{4})'.freeze

    # Replaces the first match of every given pattern with a level-4 heading.
    #
    # The patterns are applied one after the other (each on the result of the
    # previous one), at most one replacement per pattern. Prints one progress
    # line per replacement to stdout.
    #
    # @param txt [String] text to patch
    # @param rxs [Array<Regexp>, Regexp, nil] pattern(s) to look for; a single
    #   pattern or nil is accepted too
    # @param title [String] heading text to insert
    # @return [String] the patched text
    def patch_heading(txt, rxs, title)
      Array(rxs).reduce(txt) do |patched, rx|
        patched.sub(rx) do |match|
          ## change newlines to $$ for single-line outputs/dumps
          puts " found heading >#{match.gsub("\n", '$$')}<"
          "\n\n#### #{title}\n\n"
        end
      end
    end
  end # class Patcher
end ## module Rsssf

## add (shortcut) alias
RsssfPatcher = Rsssf::Patcher
|
28
|
-
|
data/test/helper.rb
DELETED
data/test/test_utils.rb
DELETED
@@ -1,83 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
require 'helper'
|
4
|
-
|
5
|
-
# Tests for the year / season / archive-dir helpers in RsssfUtils.
#
# One test method per helper, so that a failure in one helper does not
# hide the results for the others.
class TestUtils < Minitest::Test
  include RsssfUtils ## e.g. year_from_name etc.

  ## note: num <= 16 - assume 20xx for now from 00..16
  ##       - else 19xx
  def test_year_from_name
    assert_equal 2000, year_from_name('duit00')
    assert_equal 2016, year_from_name('duit16')

    assert_equal 1999, year_from_name('duit99')

    assert_equal 2001, year_from_name('duit2001')

    assert_equal 1964, year_from_name('duit64')
    assert_equal 1965, year_from_name('duit1965')
    assert_equal 2011, year_from_name('duit2011')
  end

  def test_year_from_file
    assert_equal 2000, year_from_file('duit00.txt')
    assert_equal 2000, year_from_file('duit00.html')
    assert_equal 2000, year_from_file('./duit00.txt')
    assert_equal 2000, year_from_file('xxx/1998/xxx/duit00.txt')

    assert_equal 2016, year_from_file('duit16.txt')
    assert_equal 2016, year_from_file('duit16.html')

    assert_equal 2001, year_from_file('duit2001.txt')
    assert_equal 2001, year_from_file('duit2001.html')
    assert_equal 2001, year_from_file('xx/1990s/1997/xxx/duit2001.txt')

    assert_equal 2000, year_from_file('de-deutschland/tables/duit00.txt')
    assert_equal 1964, year_from_file('de-deutschland/62/tables/duit64.txt')   # check w/ numbers in path
    assert_equal 1999, year_from_file('de-deutschland/1977/tables/duit99.txt') # check w/ numbers in path
    assert_equal 1965, year_from_file('de-deutschland/tables/duit1965.txt')
    assert_equal 2011, year_from_file('de-deutschland/tables/duit2011.txt')

    assert_equal 2000, year_from_file('de-deutschland/tables/duit00.html')
    assert_equal 1964, year_from_file('de-deutschland/62/tables/duit64.html')   # check w/ numbers in path
    assert_equal 1999, year_from_file('de-deutschland/1977/tables/duit99.html') # check w/ numbers in path
    assert_equal 1965, year_from_file('de-deutschland/tables/duit1965.html')
    assert_equal 2011, year_from_file('de-deutschland/tables/duit2011.html')
  end

  def test_year_to_season
    assert_equal '1998-99', year_to_season(1999)
    assert_equal '1999-00', year_to_season(2000) ## todo: use 1999-2000 - why? why not??
    assert_equal '2000-01', year_to_season(2001)
    assert_equal '2014-15', year_to_season(2015)

    assert_equal '1999-00', year_to_season(0)
    assert_equal '1963-64', year_to_season(64)
    assert_equal '1998-99', year_to_season(99)
    assert_equal '1964-65', year_to_season(1965)
    assert_equal '2010-11', year_to_season(2011)
  end

  ## note: year <= 2010 use season 2009-10
  def test_archive_dir_for_year
    assert_equal 'archive/1990s/1998-99', archive_dir_for_year(1999)
    assert_equal 'archive/2000s/2000-01', archive_dir_for_year(2001)
    assert_equal '2014-15', archive_dir_for_year(2015)
  end
end # class TestUtils
|
83
|
-
|