rsssf 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,157 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Rsssf
4
- module Filters
5
-
6
- def html_to_txt( html ) ## html (string) in; plain text (string) out - every replacement gets logged via puts
7
-
8
- ### converts the html body to plain text: strips/replaces known (uppercase or lowercase) tags one by one, marks headings w/ ## / #### and bold w/ **, cleans up whitespace and passes the result through sanitize
9
- # todo: check if any tags (still) present??
10
-
11
-
12
- ## cut off everything before body (incl. the opening tag itself; note: only matches a bare <BODY> w/o attributes)
13
- html = html.sub( /.+?<BODY>\s*/im, '' )
14
-
15
- ## cut off everything after body (closing) - incl. the closing tag itself
16
- html = html.sub( /<\/BODY>.*/im, '' )
17
-
18
-
19
- ## remove cite tags (keeps the enclosed text; only matches if no other tag is nested inside)
20
- html = html.gsub( /<CITE>([^<]+)<\/CITE>/im ) do |_|
21
- puts " remove cite >#{$1}<"
22
- "#{$1}"
23
- end
24
-
25
- html = html.gsub( /\s*<HR>\s*/im ) do |match| ## replace horizontal rule (hr); swallows surrounding whitespace
26
- match = match.gsub( "\n", '$$' ) ## make newlines visible for debugging
27
- puts " replace horizontal rule (hr) - >#{match}<"
28
- "\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n" ## todo: check what hr to use - . - . - or =-=-=-= or something distinct?
29
- end
30
-
31
- ## replace break (br) w/ a single newline
32
- ## note: do NOT use m/multiline for now - why? why not??
33
- html = html.gsub( /<BR>\s*/i ) do |match| ## note: include (swallow) "extra" newline
34
- match = match.gsub( "\n", '$$' ) ## make newlines visible for debugging
35
- puts " replace break (br) - >#{match}<"
36
- "\n"
37
- end
38
-
39
- ## remove anchors (a name) - keeps the enclosed text
40
- html = html.gsub( /<A NAME[^>]*>(.+?)<\/A>/im ) do |match| ## note: use .+? non-greedy match
41
- title = $1.to_s ## note: "save" capture first; gets replaced by gsub (next regex call)
42
- match = match.gsub( "\n", '$$' ) ## make newlines visible for debugging
43
- puts " replace anchor (a) name >#{title}< - >#{match}<"
44
- "#{title}"
45
- end
46
-
47
- ## remove anchors (a href) - keeps the enclosed text wrapped in ‹›
48
- # note: heading 4 includes anchor (thus, let anchors go first)
49
- # note: <a \newline href is used for authors email - thus incl. support for newline as space
50
- html = html.gsub( /<A\s+HREF[^>]*>(.+?)<\/A>/im ) do |_| ## note: use .+? non-greedy match
51
- puts " replace anchor (a) href >#{$1}<"
52
- "‹#{$1}›"
53
- end
54
-
55
- ## replace paragraph (p) w/ a blank line (two newlines)
56
- html = html.gsub( /\s*<P>\s*/im ) do |match| ## note: include (swallow) "extra" newline
57
- match = match.gsub( "\n", '$$' ) ## make newlines visible for debugging
58
- puts " replace paragraph (p) - >#{match}<"
59
- "\n\n"
60
- end
61
- html = html.gsub( /<\/P>/i, '' ) ## replace paragraph (p) closing w/ nothing for now
62
-
63
- ## remove i tags (keeps the enclosed text; only matches if no other tag is nested inside)
64
- html = html.gsub( /<I>([^<]+)<\/I>/im ) do |_|
65
- puts " remove italic (i) >#{$1}<"
66
- "#{$1}"
67
- end
68
-
69
-
70
- ## heading 2 - becomes a "## title" line (surrounding whitespace gets swallowed)
71
- html = html.gsub( /\s*<H2>([^<]+)<\/H2>\s*/im ) do |_|
72
- puts " replace heading 2 (h2) >#{$1}<"
73
- "\n\n## #{$1}\n\n" ## note: make sure to always add two newlines
74
- end
75
-
76
- ## heading 4 - becomes a "#### title" line (surrounding whitespace gets swallowed)
77
- html = html.gsub( /\s*<H4>([^<]+)<\/H4>\s*/im ) do |_|
78
- puts " replace heading 4 (h4) >#{$1}<"
79
- "\n\n#### #{$1}\n\n" ## note: make sure to always add two newlines
80
- end
81
-
82
-
83
- ## replace b tags w/ ** markers around the text - note: might include anchors (thus, call after anchors)
84
- html = html.gsub( /<B>([^<]+)<\/B>/im ) do |_|
85
- puts " remove bold (b) >#{$1}<"
86
- "**#{$1}**"
87
- end
88
-
89
- ## replace preformatted (pre) - opening and closing tags
90
- html = html.gsub( /<PRE>|<\/PRE>/i ) do |_|
91
- puts " replace preformatted (pre)"
92
- '' # replace w/ nothing for now (keep surrounding newlines)
93
- end
94
-
95
- =begin
96
- puts
97
- puts
98
- puts "html:"
99
- puts html[0..2000]
100
- puts "-- snip --"
101
- puts html[-1000..-1] ## print last 1000 chars
102
- =end
103
-
104
-
105
- ## cleanup whitespaces (line by line)
106
- ## todo/fix: convert newlines into spaces first
107
- ## and then collapse spaces etc.!!!
108
- txt = ''
109
- html.each_line do |line|
110
- line = line.gsub( "\t", ' ' ) # replace all tabs w/ the space literal for now (orig. note says two spaces - confirm against the literal)
111
- line = line.rstrip # remove trailing whitespace (incl. newline/formfeed)
112
-
113
- txt << line
114
- txt << "\n"
115
- end
116
-
117
- ### remove emails etc.
118
- txt = sanitize( txt )
119
-
120
- txt
121
- end # method html_to_txt
122
-
123
-
124
-
125
- def sanitize( txt ) ## txt (string) in; returns txt w/ (parenthesized) emails removed - every removal gets logged via puts
126
- ### remove emails for (spam/privacy) protection
127
- ## e.g. (selamm@example.es)
128
- ## (buuu@mscs.dal.ca)
129
- ## (kaxx@rsssf.com)
130
- ## (Manu_Maya@yakoo.co)
131
-
132
- ## note: supports optional ‹› enclosure (as added by html_to_txt around converted a href links, e.g. mailto)
133
- ## e.g. (‹selamm@example.es›)
134
-
135
- email_pattern = "\\(‹?[a-z][a-z0-9_]+@[a-z]+(\\.[a-z]+)+›?\\)" ## note: just a string (interpolated into the regexes below); backslashes must be doubled!!! - matches letters-only domain labels and no dots in the name part
136
-
137
- ## check for "free-standing e.g. on its own line" emails first (so that no empty line is left behind)
138
- txt = txt.gsub( /\n#{email_pattern}\n/i ) do |match|
139
- puts "removing (free-standing) email >#{match}<"
140
- "\n" # collapse to a single newline (drops the email-only line)
141
- end
142
-
143
- txt = txt.gsub( /#{email_pattern}/i ) do |match| ## all remaining (inline) emails
144
- puts "remove email >#{match}<"
145
- ''
146
- end
147
-
148
- txt
149
- end # method sanitize
150
-
151
- end # module Filters
152
- end # module Rsssf
153
-
154
- ## add (shortcut) alias
155
- RsssfFilters = Rsssf::Filters
156
-
157
-
data/lib/rsssf/patch.rb DELETED
@@ -1,28 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Rsssf
4
-
5
- class Patcher ## helper for patching (normalizing) headings in converted text
6
-
7
- ## e.g. 2008/09
8
- ## note: also support 1999/2000
9
- SEASON = '\d{4}\/(\d{2}|\d{4})' ## note: a (regex source) string - use single quotes - backslashes do NOT get processed as escapes (e.g. '\d' => "\\d")
10
-
11
- def patch_heading( txt, rxs, title ) ## txt: string to patch; rxs: list of patterns (each gets tried); title: heading text to insert; returns the patched txt
12
- rxs.each do |rx|
13
- txt = txt.sub( rx ) do |match| ## note: sub - replaces the first match only (per pattern)
14
- match = match.gsub( "\n", '$$') ## change newlines to $$ for single-line outputs/dumps
15
- puts " found heading >#{match}<"
16
- "\n\n#### #{title}\n\n"
17
- end
18
- end
19
- txt
20
- end
21
-
22
-
23
- end # class Patcher
24
- end ## module Rsssf
25
-
26
- ## add (shortcut) alias
27
- RsssfPatcher = Rsssf::Patcher
28
-
data/test/helper.rb DELETED
@@ -1,12 +0,0 @@
1
-
2
- ## $:.unshift(File.dirname(__FILE__))
3
-
4
- ## minitest setup
5
-
6
- require 'minitest/autorun'
7
-
8
-
9
- ## our own code
10
-
11
- require 'rsssf'
12
-
data/test/test_utils.rb DELETED
@@ -1,83 +0,0 @@
1
- # encoding: utf-8
2
-
3
- require 'helper'
4
-
5
- class TestUtils < MiniTest::Test ## NOTE(review): MiniTest is the legacy constant name; newer minitest versions expect Minitest::Test - confirm
6
-
7
- include RsssfUtils ## e.g. year_from_name etc.
8
-
9
- def test_year ## exercises year_from_name, year_from_file, year_to_season and archive_dir_for_year
10
-
11
- ###########
12
- ## year_from_name
13
- ## note: num <= 16 - assume 20xx for now from 00..16
14
- ## - else 19xx
15
- assert_equal 2000, year_from_name( 'duit00' )
16
- assert_equal 2016, year_from_name( 'duit16' )
17
-
18
- assert_equal 1999, year_from_name( 'duit99' )
19
-
20
- assert_equal 2001, year_from_name( 'duit2001' )
21
-
22
- assert_equal 1964, year_from_name( 'duit64' )
23
- assert_equal 1965, year_from_name( 'duit1965' )
24
- assert_equal 2011, year_from_name( 'duit2011' )
25
-
26
-
27
- ####
28
- # year_from_file - same as above but w/ paths and file extensions (numbers in the directory part must get ignored)
29
-
30
- assert_equal 2000, year_from_file( 'duit00.txt' )
31
- assert_equal 2000, year_from_file( 'duit00.html' )
32
- assert_equal 2000, year_from_file( './duit00.txt' )
33
- assert_equal 2000, year_from_file( 'xxx/1998/xxx/duit00.txt' )
34
-
35
- assert_equal 2016, year_from_file( 'duit16.txt' )
36
- assert_equal 2016, year_from_file( 'duit16.html' )
37
-
38
- assert_equal 2001, year_from_file( 'duit2001.txt' )
39
- assert_equal 2001, year_from_file( 'duit2001.html' )
40
- assert_equal 2001, year_from_file( 'xx/1990s/1997/xxx/duit2001.txt' )
41
-
42
- assert_equal 2000, year_from_file( 'de-deutschland/tables/duit00.txt' )
43
- assert_equal 1964, year_from_file( 'de-deutschland/62/tables/duit64.txt' ) # check w/ numbers in path
44
- assert_equal 1999, year_from_file( 'de-deutschland/1977/tables/duit99.txt' ) # check w/ numbers in path
45
- assert_equal 1965, year_from_file( 'de-deutschland/tables/duit1965.txt' )
46
- assert_equal 2011, year_from_file( 'de-deutschland/tables/duit2011.txt' )
47
-
48
- assert_equal 2000, year_from_file( 'de-deutschland/tables/duit00.html' )
49
- assert_equal 1964, year_from_file( 'de-deutschland/62/tables/duit64.html' ) # check w/ numbers in path
50
- assert_equal 1999, year_from_file( 'de-deutschland/1977/tables/duit99.html' ) # check w/ numbers in path
51
- assert_equal 1965, year_from_file( 'de-deutschland/tables/duit1965.html' )
52
- assert_equal 2011, year_from_file( 'de-deutschland/tables/duit2011.html' )
53
-
54
-
55
- #####
56
- ## year_to_season - the given year is the season's end year; two-digit years get accepted too
57
-
58
- assert_equal '1998-99', year_to_season( 1999 )
59
- assert_equal '1999-00', year_to_season( 2000 ) ## todo: use 1999-2000 - why? why not??
60
- assert_equal '2000-01', year_to_season( 2001 )
61
- assert_equal '2014-15', year_to_season( 2015 )
62
-
63
- assert_equal '1999-00', year_to_season( 0 )
64
- assert_equal '1963-64', year_to_season( 64 )
65
- assert_equal '1998-99', year_to_season( 99 )
66
- assert_equal '1964-65', year_to_season( 1965 )
67
- assert_equal '2010-11', year_to_season( 2011 )
68
-
69
-
70
- #######
71
- ## archive_dir_for_year
72
- ## note: year <= 2010 use season 2009-10
73
-
74
- assert_equal 'archive/1990s/1998-99', archive_dir_for_year( 1999 )
75
- assert_equal 'archive/2000s/2000-01', archive_dir_for_year( 2001 )
76
- assert_equal '2014-15', archive_dir_for_year( 2015 )
77
-
78
-
79
- assert true ## everything ok if we get here
80
- end
81
-
82
- end # class TestUtils
83
-