rsssf 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,157 +0,0 @@
1
- # encoding: utf-8
2
-
3
module Rsssf
  ##
  # Filters for converting an (old-style, upper-case tag) html page into
  # plain text with a few markdown-like markers (## / #### headings, **bold**).
  #
  # note: every replacement is logged to stdout via puts (debug trace).
  module Filters

    ##
    # Convert the body of an html page to plain text.
    #
    # Steps (order matters - e.g. headings and bold may contain anchors,
    # thus, anchors get replaced first):
    #   1. cut off everything before / after the body
    #   2. unwrap or replace cite, hr, br, a name, a href, p, i, h2, h4, b, pre
    #   3. replace tabs and strip trailing whitespace line-by-line
    #   4. remove emails (see sanitize)
    #
    # html    - String holding the page source
    #
    # Returns the converted text (String); every line ends with a newline.
    def html_to_txt( html )

      ## todo: check if any tags (still) present??

      ## cut off everything before body
      ##  note: allow attributes (e.g. <BODY BGCOLOR="...">) and
      ##        a page starting with the body tag (nothing to cut off before)
      html = html.sub( /\A.*?<BODY\b[^>]*>\s*/im, '' )

      ## cut off everything after body (closing)
      html = html.sub( /<\/BODY>.*/im, '' )

      ## remove cite (keep enclosed text)
      html = html.gsub( /<CITE>([^<]+)<\/CITE>/im ) do |_|
        text = Regexp.last_match( 1 )
        puts " remove cite >#{text}<"
        text
      end

      ## replace horizontal rule (hr) incl. surrounding whitespace
      html = html.gsub( /\s*<HR>\s*/im ) do |match|
        ## note: newlines get printed as $$ to keep the debug line on a single line
        puts " replace horizontal rule (hr) - >#{match.gsub( "\n", '$$' )}<"
        "\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n"   ## check what hr to use - . - . - or =-=-=-= or something distinct?
      end

      ## replace break (br)
      ##  note: do NOT use m/multiline for now - why? why not??
      html = html.gsub( /<BR>\s*/i ) do |match|   ## note: include (swallow) "extra" newline
        puts " replace break (br) - >#{match.gsub( "\n", '$$' )}<"
        "\n"
      end

      ## remove anchors (a name) - keep enclosed text
      html = html.gsub( /<A NAME[^>]*>(.+?)<\/A>/im ) do |match|   ## note: use .+? non-greedy match
        ## note: "save" capture first; match data gets replaced by the next gsub call
        title = Regexp.last_match( 1 ).to_s
        puts " replace anchor (a) name >#{title}< - >#{match.gsub( "\n", '$$' )}<"
        title
      end

      ## remove anchors (a href) - enclosed text gets wrapped in ‹›
      #   note: heading 4 includes anchor (thus, let anchors go first)
      #   note: <a \newline href is used for authors email - thus incl. support for newline as space
      html = html.gsub( /<A\s+HREF[^>]*>(.+?)<\/A>/im ) do |_|   ## note: use .+? non-greedy match
        text = Regexp.last_match( 1 )
        puts " replace anchor (a) href >#{text}<"
        "‹#{text}›"
      end

      ## replace paragraph (p) w/ an empty line
      html = html.gsub( /\s*<P>\s*/im ) do |match|   ## note: include (swallow) "extra" newline
        puts " replace paragraph (p) - >#{match.gsub( "\n", '$$' )}<"
        "\n\n"
      end
      html = html.gsub( /<\/P>/i, '' )   ## replace paragraph (p) closing w/ nothing for now

      ## remove i (keep enclosed text)
      html = html.gsub( /<I>([^<]+)<\/I>/im ) do |_|
        text = Regexp.last_match( 1 )
        puts " remove italic (i) >#{text}<"
        text
      end

      ## heading 2
      html = html.gsub( /\s*<H2>([^<]+)<\/H2>\s*/im ) do |_|
        text = Regexp.last_match( 1 )
        puts " replace heading 2 (h2) >#{text}<"
        "\n\n## #{text}\n\n"   ## note: make sure to always add two newlines
      end

      ## heading 4
      html = html.gsub( /\s*<H4>([^<]+)<\/H4>\s*/im ) do |_|
        text = Regexp.last_match( 1 )
        puts " replace heading 4 (h4) >#{text}<"
        "\n\n#### #{text}\n\n"   ## note: make sure to always add two newlines
      end

      ## replace b w/ ** - note: might include anchors (thus, call after anchors)
      html = html.gsub( /<B>([^<]+)<\/B>/im ) do |_|
        text = Regexp.last_match( 1 )
        puts " remove bold (b) >#{text}<"
        "**#{text}**"
      end

      ## replace preformatted (pre)
      html = html.gsub( /<PRE>|<\/PRE>/i ) do |_|
        puts " replace preformatted (pre)"
        ''   # replace w/ nothing for now (keep surrounding newlines)
      end

      ## cleanup whitespaces
      ##  todo/fix: convert newline in space first
      ##            and then collapse spaces etc.!!!
      txt = ''
      html.each_line do |line|
        ## NOTE(review): the original comment says "two spaces" per tab, but the
        ##   literal as visible here is a single space - confirm intended width
        line = line.gsub( "\t", ' ' )
        line = line.rstrip   # remove trailing whitespace (incl. newline/formfeed)

        txt << line
        txt << "\n"
      end

      ### remove emails etc.
      sanitize( txt )
    end # method html_to_txt


    ##
    # Remove emails in parentheses for (spam/privacy) protection
    #   e.g. (selamm@example.es)
    #        (buuu@mscs.dal.ca)
    #        (Manu_Maya@yakoo.co)
    #
    # note: supports the optional ‹› enclosure (used by html_to_txt for
    #       converted a href mailto links)  e.g. (‹selamm@example.es›)
    #
    # An email on a line of its own ("free-standing") gets removed together
    # with its line; any other email gets cut out in place.
    #
    # txt    - String to clean up
    #
    # Returns a new String w/o emails.
    def sanitize( txt )
      email = /\(‹?[a-z][a-z0-9_]+@[a-z]+(\.[a-z]+)+›?\)/i

      ## free-standing emails first
      ##  note: use a lookahead for the closing newline (do NOT consume it) -
      ##        otherwise the next line cannot match if it holds an email too
      ##        (and an empty line would be left behind)
      txt = txt.gsub( /\n#{email}(?=\n)/ ) do |match|
        puts "removing (free-standing) email >#{match}<"
        ''   # drop leading newline plus email; closing newline stays
      end

      txt = txt.gsub( email ) do |match|
        puts "remove email >#{match}<"
        ''
      end

      txt
    end # method sanitize

  end # module Filters
end # module Rsssf

## add (shortcut) alias
RsssfFilters = Rsssf::Filters
156
-
157
-
data/lib/rsssf/patch.rb DELETED
@@ -1,28 +0,0 @@
1
- # encoding: utf-8
2
-
3
module Rsssf

  ##
  # Helper for patching up headings in (converted) text pages.
  class Patcher

    ## regex source (String) for a season
    ##   e.g. 2008/09
    ##   note: also supports 1999/2000
    ## note: use single quotes - backslashes do NOT get processed (e.g. '\d' => "\\d")
    ## note: frozen - shared constant must not get changed in place
    SEASON = '\d{4}\/(\d{2}|\d{4})'.freeze

    ##
    # Replace the first match of every pattern with a level-4 heading
    # (that is, "\n\n#### <title>\n\n").
    #
    # txt    - String to patch
    # rxs    - list of patterns (passed on to String#sub one by one);
    #          a single pattern or nil is accepted too
    # title  - heading text to insert
    #
    # Every match found gets logged to stdout.
    #
    # Returns the patched String (txt itself if nothing matched).
    def patch_heading( txt, rxs, title )
      Array( rxs ).reduce( txt ) do |patched, rx|
        patched.sub( rx ) do |match|
          ## change newlines to $$ for single-line outputs/dumps
          puts " found heading >#{match.gsub( "\n", '$$' )}<"
          "\n\n#### #{title}\n\n"
        end
      end
    end

  end # class Patcher
end ## module Rsssf

## add (shortcut) alias
RsssfPatcher = Rsssf::Patcher
28
-
data/test/helper.rb DELETED
@@ -1,12 +0,0 @@
1
-
2
- ## $:.unshift(File.dirname(__FILE__))
3
-
4
- ## minitest setup
5
-
6
- require 'minitest/autorun'
7
-
8
-
9
- ## our own code
10
-
11
- require 'rsssf'
12
-
data/test/test_utils.rb DELETED
@@ -1,83 +0,0 @@
1
- # encoding: utf-8
2
-
3
- require 'helper'
4
-
5
##
# Checks for the year / season / archive dir helpers mixed in from RsssfUtils.
#
# note: uses Minitest::Test - the legacy MiniTest namespace is only loaded
#       on request (MT_COMPAT) by newer minitest versions.
class TestUtils < Minitest::Test

  include RsssfUtils   ## e.g. year_from_name etc.

  def test_year

    ###########
    ## year_from_name
    ##  note: num <= 16 - assume 20xx for now from 00..16
    ##         - else 19xx
    assert_equal 2000, year_from_name( 'duit00' )
    assert_equal 2016, year_from_name( 'duit16' )

    assert_equal 1999, year_from_name( 'duit99' )

    assert_equal 2001, year_from_name( 'duit2001' )

    assert_equal 1964, year_from_name( 'duit64' )
    assert_equal 1965, year_from_name( 'duit1965' )
    assert_equal 2011, year_from_name( 'duit2011' )


    ####
    ## year_from_file
    ##  note: only the file name counts; extension and directories
    ##        (incl. numbers in the path) get ignored

    assert_equal 2000, year_from_file( 'duit00.txt' )
    assert_equal 2000, year_from_file( 'duit00.html' )
    assert_equal 2000, year_from_file( './duit00.txt' )
    assert_equal 2000, year_from_file( 'xxx/1998/xxx/duit00.txt' )

    assert_equal 2016, year_from_file( 'duit16.txt' )
    assert_equal 2016, year_from_file( 'duit16.html' )

    assert_equal 2001, year_from_file( 'duit2001.txt' )
    assert_equal 2001, year_from_file( 'duit2001.html' )
    assert_equal 2001, year_from_file( 'xx/1990s/1997/xxx/duit2001.txt' )

    assert_equal 2000, year_from_file( 'de-deutschland/tables/duit00.txt' )
    assert_equal 1964, year_from_file( 'de-deutschland/62/tables/duit64.txt' )     # check w/ numbers in path
    assert_equal 1999, year_from_file( 'de-deutschland/1977/tables/duit99.txt' )   # check w/ numbers in path
    assert_equal 1965, year_from_file( 'de-deutschland/tables/duit1965.txt' )
    assert_equal 2011, year_from_file( 'de-deutschland/tables/duit2011.txt' )

    assert_equal 2000, year_from_file( 'de-deutschland/tables/duit00.html' )
    assert_equal 1964, year_from_file( 'de-deutschland/62/tables/duit64.html' )     # check w/ numbers in path
    assert_equal 1999, year_from_file( 'de-deutschland/1977/tables/duit99.html' )   # check w/ numbers in path
    assert_equal 1965, year_from_file( 'de-deutschland/tables/duit1965.html' )
    assert_equal 2011, year_from_file( 'de-deutschland/tables/duit2011.html' )


    #####
    ## year_to_season
    ##  note: the year passed in is the end year of the season
    ##        (two-digit years get accepted too)

    assert_equal '1998-99', year_to_season( 1999 )
    assert_equal '1999-00', year_to_season( 2000 )   ## todo: use 1999-2000 - why? why not??
    assert_equal '2000-01', year_to_season( 2001 )
    assert_equal '2014-15', year_to_season( 2015 )

    assert_equal '1999-00', year_to_season( 0 )
    assert_equal '1963-64', year_to_season( 64 )
    assert_equal '1998-99', year_to_season( 99 )
    assert_equal '1964-65', year_to_season( 1965 )
    assert_equal '2010-11', year_to_season( 2011 )


    #######
    ## archive_dir_for_year
    ##  note (from the original): year <= 2010 use season 2009-10
    ##  NOTE(review): the checks below only show older years ending up in
    ##    archive/<decade>s/<season> and 2015 in <season> - the exact
    ##    cut-off year is not exercised here; confirm against RsssfUtils

    assert_equal 'archive/1990s/1998-99', archive_dir_for_year( 1999 )
    assert_equal 'archive/2000s/2000-01', archive_dir_for_year( 2001 )
    assert_equal '2014-15', archive_dir_for_year( 2015 )


    assert true   ## everything ok if get here
  end

end # class TestUtils
83
-