rsssf 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/{HISTORY.md → CHANGELOG.md} +2 -0
- data/Manifest.txt +3 -6
- data/README.md +43 -26
- data/Rakefile +8 -7
- data/lib/rsssf/convert.rb +495 -0
- data/lib/rsssf/download.rb +151 -0
- data/lib/rsssf/page.rb +70 -45
- data/lib/rsssf/repo.rb +77 -153
- data/lib/rsssf/reports/page.rb +30 -19
- data/lib/rsssf/reports/schedule.rb +111 -25
- data/lib/rsssf/schedule.rb +4 -14
- data/lib/rsssf/utils.rb +10 -29
- data/lib/rsssf/version.rb +3 -5
- data/lib/rsssf.rb +42 -19
- metadata +26 -25
- data/.gemtest +0 -0
- data/lib/rsssf/fetch.rb +0 -80
- data/lib/rsssf/html2txt.rb +0 -157
- data/lib/rsssf/patch.rb +0 -28
- data/test/helper.rb +0 -12
- data/test/test_utils.rb +0 -83
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 92c74803ad71cb9cac8376ef3e0e01890352fc0a0edb01208eab3a9c41f60767
|
4
|
+
data.tar.gz: 874bdc292143352c88e23b44ed23abb98312d4afd2fd2cc797d078bce1eef0ed
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0cc04f3a78663d870ed8a4d5b77813a1601e49a67fe3ea9265972f8da5b43b88adde78d184bedc320edb29439d7abb6439e6b045181bedea48c6d6be037d1a86
|
7
|
+
data.tar.gz: e92c7acc956eacd756665e83baee9373cf2af422da5deddd51e91e1266d7498e005a8d0b2e88868a23b3929c7b1900ef5bf20c7bd02e260bd07152e3e53d850a
|
data/{HISTORY.md → CHANGELOG.md}
RENAMED
data/Manifest.txt
CHANGED
@@ -1,17 +1,14 @@
|
|
1
|
-
|
1
|
+
CHANGELOG.md
|
2
2
|
Manifest.txt
|
3
3
|
README.md
|
4
4
|
Rakefile
|
5
5
|
lib/rsssf.rb
|
6
|
-
lib/rsssf/
|
7
|
-
lib/rsssf/
|
6
|
+
lib/rsssf/convert.rb
|
7
|
+
lib/rsssf/download.rb
|
8
8
|
lib/rsssf/page.rb
|
9
|
-
lib/rsssf/patch.rb
|
10
9
|
lib/rsssf/repo.rb
|
11
10
|
lib/rsssf/reports/page.rb
|
12
11
|
lib/rsssf/reports/schedule.rb
|
13
12
|
lib/rsssf/schedule.rb
|
14
13
|
lib/rsssf/utils.rb
|
15
14
|
lib/rsssf/version.rb
|
16
|
-
test/helper.rb
|
17
|
-
test/test_utils.rb
|
data/README.md
CHANGED
@@ -1,19 +1,18 @@
|
|
1
1
|
# rsssf - tools 'n' scripts for RSSSF (Rec.Sport.Soccer Statistics Foundation) archive data
|
2
2
|
|
3
3
|
|
4
|
-
* home :: [github.com/sportdb/
|
5
|
-
* bugs :: [github.com/sportdb/
|
4
|
+
* home :: [github.com/sportdb/sport.db.sources](https://github.com/sportdb/sport.db.sources)
|
5
|
+
* bugs :: [github.com/sportdb/sport.db.sources/issues](https://github.com/sportdb/sport.db.sources/issues)
|
6
6
|
* gem :: [rubygems.org/gems/rsssf](https://rubygems.org/gems/rsssf)
|
7
7
|
* rdoc :: [rubydoc.info/gems/rsssf](http://rubydoc.info/gems/rsssf)
|
8
|
-
* forum :: [opensport](http://groups.google.com/group/opensport)
|
9
8
|
|
10
9
|
|
11
|
-
## What's the Rec.Sport.Soccer Statistics Foundation (RSSSF)?
|
12
10
|
|
13
|
-
The RSSSF collects and offers football (soccer) league tables, match results and more
|
14
|
-
from all over the world online in plain text.
|
15
11
|
|
16
|
-
|
12
|
+
## What's the Rec.Sport.Soccer Statistics Foundation (RSSSF)?
|
13
|
+
|
14
|
+
The RSSSF collects and offers football (soccer) league tables, match results and more
|
15
|
+
from all over the world online in plain text. Example:
|
17
16
|
|
18
17
|
```
|
19
18
|
Round 1
|
@@ -46,15 +45,41 @@ Coritiba 2-1 Atlético/MG
|
|
46
45
|
|
47
46
|
## Usage
|
48
47
|
|
49
|
-
### Working with Pages
|
50
48
|
|
51
|
-
|
49
|
+
### Download (and Cache) Pages
|
50
|
+
|
51
|
+
To download (and cache) pages from the world wide web use:
|
52
52
|
|
53
53
|
``` ruby
|
54
|
-
|
54
|
+
Rsssf.download_page( 'https://rsssf.org/tablese/eng2024.html',
|
55
|
+
encoding: 'Windows-1252' )
|
56
|
+
|
57
|
+
Rsssf.download_page( 'https://rsssf.org/tablesb/braz2024.html',
|
58
|
+
encoding: 'Windows-1252' )
|
59
|
+
```
|
60
|
+
|
61
|
+
Note: Most pages on rsssf.org use the Windows-1252 (character) encoding.
|
62
|
+
To "auto-magically" convert to unicode (utf-8)
|
63
|
+
add the encoding option (default is `UTF-8`).
|
64
|
+
|
65
|
+
Or as a convenience shortcut download (pre-configured table) pages by country code (e.g. `eng` - England, `es` - Spain (España), `de` - Germany (Deutschland), `br` - Brazil (Brasil) etc.)
|
66
|
+
and season (e.g. `2023/24` or `2024` etc.)
|
67
|
+
|
68
|
+
``` ruby
|
69
|
+
Rsssf.download_table( 'eng', season: '2023/24' )
|
70
|
+
|
71
|
+
Rsssf.download_table( 'br', season: '2024' )
|
55
72
|
```
|
56
73
|
|
57
|
-
|
74
|
+
|
75
|
+
Note: The rsssf machinery uses a built-in web cache. All downloads get "auto-magically" cached (in `./cache/rsssf.org`).
|
76
|
+
|
77
|
+
|
78
|
+
|
79
|
+
### Working with Pages
|
80
|
+
|
81
|
+
|
82
|
+
Note: The `RsssfPage` machinery will convert the rsssf archive page
|
58
83
|
from hypertext (HTML) to plain text e.g.
|
59
84
|
|
60
85
|
```
|
@@ -121,7 +146,7 @@ Step 2: Fetch all archive pages
|
|
121
146
|
Use:
|
122
147
|
|
123
148
|
``` ruby
|
124
|
-
repo = RsssfRepo.new( './
|
149
|
+
repo = RsssfRepo.new( './england', title: 'England (and Wales)' )
|
125
150
|
repo.fetch_pages
|
126
151
|
```
|
127
152
|
|
@@ -139,7 +164,7 @@ football.db RSSSF Archive Data Summary for England (and Wales)
|
|
139
164
|
|
140
165
|
_Last Update: 2015-11-26 18:22:22 +0200_
|
141
166
|
|
142
|
-
| Season | File | Authors | Last Updated | Lines (Chars) | Sections |
|
167
|
+
| Season | File | Authors | Last Updated | Lines (Chars) | Sections |
|
143
168
|
| :------ | :------ | :------- | :----------- | ------------: | :------- |
|
144
169
|
| 2014-15 | [eng2015.txt](https://github.com/rsssf/eng-england/blob/master/tables/eng2015.txt) | Ian King and Karel Stokkermans | 4 Jun 2015 | 1249 (34138) | Premier League, Cup Tournaments, Championship, Division 1, Division 2, Conference |
|
145
170
|
| 2013-14 | [eng2014.txt](https://github.com/rsssf/eng-england/blob/master/tables/eng2014.txt) | Ian King and Karel Stokkermans | 5 Feb 2015 | 1254 (34294) | Premier League, Cup Tournaments, Championship, Division 1, Division 2, Conference |
|
@@ -170,23 +195,15 @@ schedule.save( './facup.txt' )
|
|
170
195
|
|
171
196
|
|
172
197
|
|
173
|
-
## Install
|
174
|
-
|
175
|
-
Just install the gem:
|
176
|
-
|
177
|
-
$ gem install rsssf
|
178
|
-
|
179
|
-
|
180
|
-
|
181
198
|
## RSSSF Datasets
|
182
199
|
|
183
200
|
See the rsssf github org for pre-processed ready-to-import datasets. Prepared repos include:
|
184
201
|
|
185
|
-
- [`
|
186
|
-
- [`
|
187
|
-
- [`
|
188
|
-
- [`
|
189
|
-
- [`
|
202
|
+
- [`england`](https://github.com/rsssf/england) - rsssf archive data for England - Premier League, Championship, FA Cup etc.
|
203
|
+
- [`deutschland`](https://github.com/rsssf/deutschland) - rsssf archive data for Germany (Deutschland) - Deutsche Bundesliga, 2. Bundesliga, 3. Liga, DFB Pokal etc.
|
204
|
+
- [`espana`](https://github.com/rsssf/espana) - rsssf archive data for España (Spain) - Primera División / La Liga, Copa del Rey, etc.
|
205
|
+
- [`austria`](https://github.com/rsssf/austria) - rsssf archive data for Austria (Österreich) - Österr. Bundesliga, Erste Liga, ÖFB Pokal etc.
|
206
|
+
- [`brazil`](https://github.com/rsssf/brazil) - rsssf archive data for Brazil (Brasil) - Campeonato Brasileiro Série A / Brasileirão etc.
|
190
207
|
- and more
|
191
208
|
|
192
209
|
|
data/Rakefile
CHANGED
@@ -8,25 +8,26 @@ Hoe.spec 'rsssf' do
|
|
8
8
|
self.summary = "rsssf - tools 'n' scripts for RSSSF (Rec.Sport.Soccer Statistics Foundation) archive data"
|
9
9
|
self.description = summary
|
10
10
|
|
11
|
-
self.urls =
|
11
|
+
self.urls = { home: 'https://github.com/sportdb/sport.db.sources' }
|
12
12
|
|
13
13
|
self.author = 'Gerald Bauer'
|
14
|
-
self.email = '
|
14
|
+
self.email = 'gerald.bauer@gmail.com'
|
15
15
|
|
16
16
|
# switch extension to .markdown for github formatting
|
17
17
|
self.readme_file = 'README.md'
|
18
|
-
self.history_file = '
|
18
|
+
self.history_file = 'CHANGELOG.md'
|
19
19
|
|
20
20
|
self.extra_deps = [
|
21
|
-
['
|
22
|
-
['
|
23
|
-
['
|
21
|
+
['cocos'],
|
22
|
+
['season-formats'],
|
23
|
+
['rsssf-parser'], ## add rsssf parser machinery & tool
|
24
24
|
]
|
25
25
|
|
26
|
+
|
26
27
|
self.licenses = ['Public Domain']
|
27
28
|
|
28
29
|
self.spec_extras = {
|
29
|
-
required_ruby_version: '>=
|
30
|
+
required_ruby_version: '>= 2.2.2'
|
30
31
|
}
|
31
32
|
|
32
33
|
end
|
@@ -0,0 +1,495 @@
|
|
1
|
+
|
2
|
+
module Rsssf
|
3
|
+
class PageConverter
|
4
|
+
|
5
|
+
## convenience helper
|
6
|
+
def self.convert( html, url: )
|
7
|
+
@@converter ||= new ## use a "shared" built-in converter
|
8
|
+
@@converter.convert( html, url: url )
|
9
|
+
end
|
10
|
+
|
11
|
+
##
|
12
|
+
## add anchor: options or such
|
13
|
+
## lets you toggle adding anchors (§premier etc.) - why? why not?
|
14
|
+
|
15
|
+
def convert( html, url: )
|
16
|
+
### todo/fix: first check if html is all ascii-7bit e.g.
|
17
|
+
## includes only chars from 64 to 127!!!
|
18
|
+
|
19
|
+
## normalize newlines
|
20
|
+
## remove \r (carriage return) used by Windows; just use \n (new line)
|
21
|
+
html = html.gsub( "\r", '' )
|
22
|
+
|
23
|
+
## check for html entities
|
24
|
+
html = html.gsub( "ä", 'ä' )
|
25
|
+
html = html.gsub( "ö", 'ö' )
|
26
|
+
html = html.gsub( "ü", 'ü' )
|
27
|
+
html = html.gsub( "Ä", 'Ä' )
|
28
|
+
html = html.gsub( "Ö", 'Ö' )
|
29
|
+
html = html.gsub( "Ü", 'Ü' )
|
30
|
+
html = html.gsub( "ß", 'ß' )
|
31
|
+
|
32
|
+
## typos / autofix - keep - why? why not?
|
33
|
+
html = html.gsub( "&oulm;", 'ö' ) ## support typo in entity (ö)
|
34
|
+
html = html.gsub( "¨", 'ü' ) ## support typo in entity (ü) - why? why not?
|
35
|
+
html = html.gsub( "&slig;", "ß" ) ## support typo in entity (ß)
|
36
|
+
html = html.gsub( "&aaacute;", "á" ) ## typo for á
|
37
|
+
|
38
|
+
|
39
|
+
html = html.gsub( "É", 'É' )
|
40
|
+
html = html.gsub( "ø", 'ø' )
|
41
|
+
html = html.gsub( "ã", 'ã' )
|
42
|
+
html = html.gsub( "õ", 'õ' )
|
43
|
+
html = html.gsub( "ô", 'ô' )
|
44
|
+
|
45
|
+
entities = %w[
|
46
|
+
À À
|
47
|
+
Á Á
|
48
|
+
 Â
|
49
|
+
à Ã
|
50
|
+
Ä Ä
|
51
|
+
Å Å
|
52
|
+
à à
|
53
|
+
á á
|
54
|
+
â â
|
55
|
+
ã ã
|
56
|
+
ä ä
|
57
|
+
å å
|
58
|
+
Æ Æ
|
59
|
+
æ æ
|
60
|
+
ß ß
|
61
|
+
Ç Ç
|
62
|
+
ç ç
|
63
|
+
È È
|
64
|
+
É É
|
65
|
+
Ê Ê
|
66
|
+
Ë Ë
|
67
|
+
è è
|
68
|
+
é é
|
69
|
+
ê ê
|
70
|
+
ë ë
|
71
|
+
Ì Ì
|
72
|
+
Í Í
|
73
|
+
Î Î
|
74
|
+
Ï Ï
|
75
|
+
ì ì
|
76
|
+
í í
|
77
|
+
î î
|
78
|
+
ï ï
|
79
|
+
Ñ Ñ
|
80
|
+
ñ ñ
|
81
|
+
Ò Ò
|
82
|
+
Ó Ó
|
83
|
+
Ô Ô
|
84
|
+
Õ Õ
|
85
|
+
Ö Ö
|
86
|
+
ò ò
|
87
|
+
ó ó
|
88
|
+
ô ô
|
89
|
+
õ õ
|
90
|
+
ö ö
|
91
|
+
Ø Ø
|
92
|
+
ø ø
|
93
|
+
Ù Ù
|
94
|
+
Ú Ú
|
95
|
+
Û Û
|
96
|
+
Ü Ü
|
97
|
+
ù ù
|
98
|
+
ú ú
|
99
|
+
û û
|
100
|
+
ü ü
|
101
|
+
Ý Ý
|
102
|
+
ý ý
|
103
|
+
ÿ ÿ
|
104
|
+
|
105
|
+
< <
|
106
|
+
> >
|
107
|
+
& &
|
108
|
+
© ©
|
109
|
+
® ®
|
110
|
+
|
111
|
+
Š Š
|
112
|
+
š š
|
113
|
+
č č
|
114
|
+
ć ć
|
115
|
+
Ž Ž
|
116
|
+
’ ’
|
117
|
+
]
|
118
|
+
|
119
|
+
|
120
|
+
|
121
|
+
entities.each_slice(2) do |str, entity|
|
122
|
+
html = html.gsub( entity, str )
|
123
|
+
end
|
124
|
+
|
125
|
+
|
126
|
+
|
127
|
+
##############
|
128
|
+
## check for more entities
|
129
|
+
## limit &---; to length 10 - why? why not?
|
130
|
+
html = html.gsub( /&[^; ]{1,10};/) do |match|
|
131
|
+
|
132
|
+
match = if match == 'ij' ## use like Van Dijk -> Van Dijk
|
133
|
+
'ij'
|
134
|
+
else
|
135
|
+
msg = "found unencoded html entity #{match}"
|
136
|
+
puts "*** WARN - #{msg}"
|
137
|
+
log( msg ) ## log too (see logs.txt)
|
138
|
+
|
139
|
+
match ## pass through as is (1:1)
|
140
|
+
end
|
141
|
+
|
142
|
+
match
|
143
|
+
end
|
144
|
+
## todo/fix: add more entities
|
145
|
+
|
146
|
+
###################################
|
147
|
+
### smart quotes quick fixes
|
148
|
+
### convert all "smart" quotes to (standard) single quotes
|
149
|
+
## D´Alessandro => D'Alessandro
|
150
|
+
|
151
|
+
html = html.gsub( '´', "'" )
|
152
|
+
|
153
|
+
html = html.gsub( '’', "'" )
|
154
|
+
html = html.gsub( '‘', "'" )
|
155
|
+
html = html.gsub( '“', '"' )
|
156
|
+
html = html.gsub( '”', '"' )
|
157
|
+
|
158
|
+
### convert fancy dashes/hyphens to plain dash/hyphen
|
159
|
+
html = html.gsub( '–', '-' )
|
160
|
+
|
161
|
+
|
162
|
+
|
163
|
+
txt = html_to_txt( html )
|
164
|
+
|
165
|
+
header = <<EOS
|
166
|
+
<!--
|
167
|
+
source: #{url}
|
168
|
+
-->
|
169
|
+
|
170
|
+
EOS
|
171
|
+
|
172
|
+
header+txt ## return txt w/ header
|
173
|
+
end ## method convert
|
174
|
+
|
175
|
+
|
176
|
+
## todo/fix - use generic heading regex for all h2/h3/h4 etc.
|
177
|
+
## exclude h1 - why? why not?
|
178
|
+
## note - include leading and trailing spaces !!!
|
179
|
+
##
|
180
|
+
## note - for content use non-greedy to allow
|
181
|
+
## match of tags inside content too
|
182
|
+
HEADING2_RE = %r{ \s*
|
183
|
+
<H2>
|
184
|
+
(?<title>.+?)
|
185
|
+
</H2>
|
186
|
+
\s*
|
187
|
+
}imx
|
188
|
+
|
189
|
+
HEADING4_RE = %r{ \s*
|
190
|
+
<H4>
|
191
|
+
(?<title>.+?)
|
192
|
+
</H4>
|
193
|
+
\s*
|
194
|
+
}imx
|
195
|
+
|
196
|
+
def replace_h2( html )
|
197
|
+
html.gsub( HEADING2_RE ) do |_|
|
198
|
+
m = Regexp.last_match
|
199
|
+
puts " replace heading 2 (h2) >#{m[:title]}<"
|
200
|
+
"\n\n## #{m[:title]}\n\n" ## note: make sure to always add two newlines
|
201
|
+
end
|
202
|
+
end
|
203
|
+
|
204
|
+
def replace_h4( html )
|
205
|
+
html.gsub( HEADING4_RE ) do |_|
|
206
|
+
m = Regexp.last_match
|
207
|
+
puts " replace heading 4 (h4) >#{m[:title]}<"
|
208
|
+
"\n\n#### #{m[:title]}\n\n" ## note: make sure to always add two newlines
|
209
|
+
end
|
210
|
+
end
|
211
|
+
|
212
|
+
|
213
|
+
def squish( str )
|
214
|
+
## squish more than one white space to one space
|
215
|
+
str.gsub( /[ \r\t\n]+/, ' ' )
|
216
|
+
end
|
217
|
+
|
218
|
+
|
219
|
+
def patch_about( html )
|
220
|
+
# <A name=about>
|
221
|
+
# <H2>About this document</H2></A>
|
222
|
+
# or
|
223
|
+
# <A NAME="about"><H2>About this document</H2></A>
|
224
|
+
# => change to (possible?)
|
225
|
+
# <H2><A name=about>About this document</A></H2>
|
226
|
+
|
227
|
+
html.sub( %r{<A [ ] name=(about|"about")> \s*
|
228
|
+
<H2>About [ ] this [ ] document</H2></A>
|
229
|
+
}ixm,
|
230
|
+
"<H2><A name=about>About this document</A></H2>"
|
231
|
+
)
|
232
|
+
end
|
233
|
+
|
234
|
+
# <a name="sa">Série A</a>
|
235
|
+
# <a name="sd">Série D</a>
|
236
|
+
|
237
|
+
# <A name=about>
|
238
|
+
# <H2>About this document</H2></A>
|
239
|
+
# => change to (possible?)
|
240
|
+
# <H2><A name=about>About this document</A></H2>
|
241
|
+
#
|
242
|
+
#
|
243
|
+
# <h4><a name="cb">Copa do Brasil</a></h4>
|
244
|
+
|
245
|
+
## note - for content use non-greedy to allow
|
246
|
+
## match of tags inside content too
|
247
|
+
|
248
|
+
A_NAME_RE = %r{<A [ ]+ NAME [ ]* =
|
249
|
+
(?<name>[^>]+?)
|
250
|
+
>
|
251
|
+
(?<title>.+?)
|
252
|
+
</A>
|
253
|
+
}imx
|
254
|
+
|
255
|
+
# <a href="#sa">Série A</a><br>
|
256
|
+
#
|
257
|
+
# <A href="http://www.rsssf.org/">Rec.Sport.Soccer
|
258
|
+
# Statistics Foundation</A>
|
259
|
+
# <A href="http://www.rsssfbrasil.com">RSSSF
|
260
|
+
# Brazil</A>
|
261
|
+
#
|
262
|
+
# and Daniel Dalence (<A
|
263
|
+
# href="mailto:danielballack@terra.com.br">danielballack@terra.com.br</A>)
|
264
|
+
|
265
|
+
|
266
|
+
A_HREF_RE = %r{<A \s+ HREF [ ]* =
|
267
|
+
(?<href>[^>]+?)
|
268
|
+
>
|
269
|
+
(?<title>.+?)
|
270
|
+
<\/A>
|
271
|
+
}imx
|
272
|
+
|
273
|
+
|
274
|
+
def replace_a_href( html )
|
275
|
+
## remove anchors (a href)
|
276
|
+
# note: heading 4 includes anchor (thus, let anchors go first)
|
277
|
+
# note: <a \newline href is used for authors email - thus incl. support for newline as space
|
278
|
+
html.gsub( A_HREF_RE ) do |match| ## note: use .+? non-greedy match
|
279
|
+
m = Regexp.last_match
|
280
|
+
href = m[:href].gsub( /["']/, '' ).strip ## remove ("" or '')
|
281
|
+
title = m[:title].strip ## note: "save" capture first; gets replaced by gsub (next regex call)
|
282
|
+
|
283
|
+
|
284
|
+
## e.g.
|
285
|
+
## ‹Larsen23@gmx.de, see page mailto:Larsen23@gmx.de›
|
286
|
+
## ‹danielballack@terra.com.br, see page mailto:danielballack@terra.com.br›
|
287
|
+
## ‹zja70@aol.com, see page mailto:zja70@aol.com›)
|
288
|
+
if href.start_with?( 'mailto:')
|
289
|
+
puts " blank mailto - anchor (a) href >#{href}, >#{title}<"
|
290
|
+
'‹mailto›' ## delete/remove email
|
291
|
+
else
|
292
|
+
puts " replace anchor (a) href >#{href}, >#{title}<"
|
293
|
+
|
294
|
+
## convert href to xref
|
295
|
+
xref = if href.start_with?('#') ## in-page ref
|
296
|
+
", see §#{href[1..-1]}"
|
297
|
+
elsif href.start_with?( /https?:/ ) ## external page ref
|
298
|
+
## skip - keep empty - why? why not? (or add url domain?)
|
299
|
+
''
|
300
|
+
else
|
301
|
+
## hack - check for some custom excludes
|
302
|
+
if title.start_with?( 'Rec.Sport.Soccer' )
|
303
|
+
## skip - keep empty
|
304
|
+
''
|
305
|
+
else
|
306
|
+
## strip (ending) .htm|html
|
307
|
+
", see page #{href.sub( /\.html?$/,'')}"
|
308
|
+
end
|
309
|
+
end
|
310
|
+
|
311
|
+
"‹#{squish(title)}#{xref}›"
|
312
|
+
end
|
313
|
+
end
|
314
|
+
end
|
315
|
+
|
316
|
+
def replace_a_name( html )
|
317
|
+
##
|
318
|
+
## remove (named) anchors
|
319
|
+
html.gsub( A_NAME_RE ) do |match| ## note: use .+? non-greedy match
|
320
|
+
m = Regexp.last_match
|
321
|
+
name = m[:name].gsub( /["']/, '' ).strip ## remove ("" or '')
|
322
|
+
title = m[:title].strip ## note: "save" capture first; gets replaced by gsub (next regex call)
|
323
|
+
match = match.gsub( "\n", '$$' ) ## make newlines visible for debugging
|
324
|
+
puts " replace anchor (a) name >#{name}<, >#{title}< - >#{match}<"
|
325
|
+
|
326
|
+
|
327
|
+
##
|
328
|
+
## todo - report WARN if title incl. tags
|
329
|
+
## assumes text only for now - why? why not?
|
330
|
+
## add a name inside heading !!!
|
331
|
+
## do NOT add heading inside a name !!!
|
332
|
+
|
333
|
+
"#{title} ‹§#{name}›" ## note - use two spaces min (between title & name)
|
334
|
+
end
|
335
|
+
end
|
336
|
+
|
337
|
+
|
338
|
+
EMAIL_RE = %r{ \s*
|
339
|
+
\(
|
340
|
+
[a-z][a-z0-9_]+
|
341
|
+
@[a-z]+(\.[a-z]+)+
|
342
|
+
\)
|
343
|
+
}imx
|
344
|
+
|
345
|
+
|
346
|
+
def remove_emails( html )
|
347
|
+
### remove converted ("blinded") mailto anchors
|
348
|
+
## note usually inside () e.g.
|
349
|
+
## (‹mailto›)
|
350
|
+
## plus slurp up all leading whitespace (incl. newline) - why? why not?
|
351
|
+
html = html.gsub( /\s*
|
352
|
+
\(‹mailto›\)
|
353
|
+
/xm, '' )
|
354
|
+
|
355
|
+
###
|
356
|
+
## remove "regular" emails too e.g.
|
357
|
+
##
|
358
|
+
## Thanks to Marcelo Leme de Arruda (___@___.__.br),
|
359
|
+
## Ricardo FF Pontes (___@____.com),
|
360
|
+
## Santiago Reis (____@____.com.br),
|
361
|
+
## Marcos Lacerda Queiroz (___@____.com.br)
|
362
|
+
## etc.
|
363
|
+
|
364
|
+
## check for "free-standing e.g. on its own line" emails only for now
|
365
|
+
html = html.gsub( EMAIL_RE ) do |match|
|
366
|
+
puts "removing email >#{match}<"
|
367
|
+
''
|
368
|
+
end
|
369
|
+
html
|
370
|
+
end
|
371
|
+
|
372
|
+
|
373
|
+
|
374
|
+
def html_to_txt( html )
|
375
|
+
|
376
|
+
###
|
377
|
+
# todo: check if any tags (still) present??
|
378
|
+
|
379
|
+
|
380
|
+
## cut off everything before body
|
381
|
+
html = html.sub( /.+?<BODY>\s*/im, '' )
|
382
|
+
|
383
|
+
## cut off everything after body (closing)
|
384
|
+
html = html.sub( /<\/BODY>.*/im, '' )
|
385
|
+
|
386
|
+
html = patch_about( html )
|
387
|
+
|
388
|
+
## remove cite
|
389
|
+
html = html.gsub( /<CITE>([^<]+)<\/CITE>/im ) do |_|
|
390
|
+
puts " remove cite >#{$1}<"
|
391
|
+
"#{$1}"
|
392
|
+
end
|
393
|
+
|
394
|
+
html = html.gsub( /\s*<HR>\s*/im ) do |match|
|
395
|
+
match = match.gsub( "\n", '$$' ) ## make newlines visible for debugging
|
396
|
+
puts " replace horizontal rule (hr) - >#{match}<"
|
397
|
+
"\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n" ## check what hr to use - . - . - or =-=-=-= or something distinct?
|
398
|
+
end
|
399
|
+
|
400
|
+
## replace break (br)
|
401
|
+
## note: do NOT use m/multiline for now - why? why not??
|
402
|
+
html = html.gsub( /<BR>\s*/i ) do |match| ## note: include (swallow) "extra" newline
|
403
|
+
match = match.gsub( "\n", '$$' ) ## make newlines visible for debugging
|
404
|
+
puts " replace break (br) - >#{match}<"
|
405
|
+
"\n"
|
406
|
+
end
|
407
|
+
|
408
|
+
|
409
|
+
|
410
|
+
html = replace_a_href( html )
|
411
|
+
## note a name="about" includes more a hrefs etc.
|
412
|
+
# let it go first (before a href)
|
413
|
+
html = replace_a_name( html )
|
414
|
+
|
415
|
+
|
416
|
+
|
417
|
+
## replace paragraph (p)
|
418
|
+
html = html.gsub( /\s*<P>\s*/im ) do |match| ## note: include (swallow) "extra" newline
|
419
|
+
match = match.gsub( "\n", '$$' ) ## make newlines visible for debugging
|
420
|
+
puts " replace paragraph (p) - >#{match}<"
|
421
|
+
"\n\n"
|
422
|
+
end
|
423
|
+
html = html.gsub( /<\/P>/i, '' ) ## replace paragraph (p) closing w/ nothing for now
|
424
|
+
|
425
|
+
## remove i
|
426
|
+
html = html.gsub( /<I>([^<]+)<\/I>/im ) do |_|
|
427
|
+
puts " remove italic (i) >#{$1}<"
|
428
|
+
"#{$1}"
|
429
|
+
end
|
430
|
+
|
431
|
+
|
432
|
+
html = replace_h2( html )
|
433
|
+
html = replace_h4( html )
|
434
|
+
|
435
|
+
|
436
|
+
|
437
|
+
|
438
|
+
## remove b - note: might include anchors (thus, call after anchors)
|
439
|
+
html = html.gsub( /<B>([^<]+)<\/B>/im ) do |_|
|
440
|
+
puts " remove bold (b) >#{$1}<"
|
441
|
+
"**#{$1}**"
|
442
|
+
end
|
443
|
+
|
444
|
+
## replace preformatted (pre)
|
445
|
+
html = html.gsub( /<PRE>|<\/PRE>/i ) do |_|
|
446
|
+
puts " replace preformatted (pre)"
|
447
|
+
'' # replace w/ nothing for now (keep surrounding newlines)
|
448
|
+
end
|
449
|
+
|
450
|
+
=begin
|
451
|
+
puts
|
452
|
+
puts
|
453
|
+
puts "html:"
|
454
|
+
puts html[0..2000]
|
455
|
+
puts "-- snip --"
|
456
|
+
puts html[-1000..-1] ## print last thousand chars
|
457
|
+
=end
|
458
|
+
|
459
|
+
|
460
|
+
html = remove_emails( html )
|
461
|
+
|
462
|
+
|
463
|
+
## cleanup whitespaces
|
464
|
+
## todo/fix: convert newline in space first
|
465
|
+
## and then collapse spaces etc.!!!
|
466
|
+
txt = String.new
|
467
|
+
html.each_line do |line|
|
468
|
+
line = line.gsub( "\t", ' ' ) # replace all tabs w/ two spaces for now
|
469
|
+
line = line.rstrip # remove trailing whitespace (incl. newline/formfeed)
|
470
|
+
|
471
|
+
txt << line
|
472
|
+
txt << "\n"
|
473
|
+
end
|
474
|
+
|
475
|
+
txt
|
476
|
+
end # method html_to_txt
|
477
|
+
|
478
|
+
|
479
|
+
|
480
|
+
###
|
481
|
+
# more helpers
|
482
|
+
def log( msg )
|
483
|
+
## append msg to ./logs.txt
|
484
|
+
## use ./errors.txt - why? why not?
|
485
|
+
File.open( './logs.txt', 'a:utf-8' ) do |f|
|
486
|
+
f.write( msg )
|
487
|
+
f.write( "\n" )
|
488
|
+
end
|
489
|
+
end
|
490
|
+
|
491
|
+
|
492
|
+
|
493
|
+
end # class PageConverter
|
494
|
+
end # module Rsssf
|
495
|
+
|