rsssf 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/{HISTORY.md → CHANGELOG.md} +2 -0
- data/Manifest.txt +3 -6
- data/README.md +43 -26
- data/Rakefile +8 -7
- data/lib/rsssf/convert.rb +495 -0
- data/lib/rsssf/download.rb +151 -0
- data/lib/rsssf/page.rb +70 -45
- data/lib/rsssf/repo.rb +77 -153
- data/lib/rsssf/reports/page.rb +30 -19
- data/lib/rsssf/reports/schedule.rb +111 -25
- data/lib/rsssf/schedule.rb +4 -14
- data/lib/rsssf/utils.rb +10 -29
- data/lib/rsssf/version.rb +3 -5
- data/lib/rsssf.rb +42 -19
- metadata +26 -25
- data/.gemtest +0 -0
- data/lib/rsssf/fetch.rb +0 -80
- data/lib/rsssf/html2txt.rb +0 -157
- data/lib/rsssf/patch.rb +0 -28
- data/test/helper.rb +0 -12
- data/test/test_utils.rb +0 -83
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 92c74803ad71cb9cac8376ef3e0e01890352fc0a0edb01208eab3a9c41f60767
|
4
|
+
data.tar.gz: 874bdc292143352c88e23b44ed23abb98312d4afd2fd2cc797d078bce1eef0ed
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0cc04f3a78663d870ed8a4d5b77813a1601e49a67fe3ea9265972f8da5b43b88adde78d184bedc320edb29439d7abb6439e6b045181bedea48c6d6be037d1a86
|
7
|
+
data.tar.gz: e92c7acc956eacd756665e83baee9373cf2af422da5deddd51e91e1266d7498e005a8d0b2e88868a23b3929c7b1900ef5bf20c7bd02e260bd07152e3e53d850a
|
data/{HISTORY.md → CHANGELOG.md}
RENAMED
data/Manifest.txt
CHANGED
@@ -1,17 +1,14 @@
|
|
1
|
-
|
1
|
+
CHANGELOG.md
|
2
2
|
Manifest.txt
|
3
3
|
README.md
|
4
4
|
Rakefile
|
5
5
|
lib/rsssf.rb
|
6
|
-
lib/rsssf/
|
7
|
-
lib/rsssf/
|
6
|
+
lib/rsssf/convert.rb
|
7
|
+
lib/rsssf/download.rb
|
8
8
|
lib/rsssf/page.rb
|
9
|
-
lib/rsssf/patch.rb
|
10
9
|
lib/rsssf/repo.rb
|
11
10
|
lib/rsssf/reports/page.rb
|
12
11
|
lib/rsssf/reports/schedule.rb
|
13
12
|
lib/rsssf/schedule.rb
|
14
13
|
lib/rsssf/utils.rb
|
15
14
|
lib/rsssf/version.rb
|
16
|
-
test/helper.rb
|
17
|
-
test/test_utils.rb
|
data/README.md
CHANGED
@@ -1,19 +1,18 @@
|
|
1
1
|
# rsssf - tools 'n' scripts for RSSSF (Rec.Sport.Soccer Statistics Foundation) archive data
|
2
2
|
|
3
3
|
|
4
|
-
* home :: [github.com/sportdb/
|
5
|
-
* bugs :: [github.com/sportdb/
|
4
|
+
* home :: [github.com/sportdb/sport.db.sources](https://github.com/sportdb/sport.db.sources)
|
5
|
+
* bugs :: [github.com/sportdb/sport.db.sources/issues](https://github.com/sportdb/sport.db.sources/issues)
|
6
6
|
* gem :: [rubygems.org/gems/rsssf](https://rubygems.org/gems/rsssf)
|
7
7
|
* rdoc :: [rubydoc.info/gems/rsssf](http://rubydoc.info/gems/rsssf)
|
8
|
-
* forum :: [opensport](http://groups.google.com/group/opensport)
|
9
8
|
|
10
9
|
|
11
|
-
## What's the Rec.Sport.Soccer Statistics Foundation (RSSSF)?
|
12
10
|
|
13
|
-
The RSSSF collects and offers football (soccer) league tables, match results and more
|
14
|
-
from all over the world online in plain text.
|
15
11
|
|
16
|
-
|
12
|
+
## What's the Rec.Sport.Soccer Statistics Foundation (RSSSF)?
|
13
|
+
|
14
|
+
The RSSSF collects and offers football (soccer) league tables, match results and more
|
15
|
+
from all over the world online in plain text. Example:
|
17
16
|
|
18
17
|
```
|
19
18
|
Round 1
|
@@ -46,15 +45,41 @@ Coritiba 2-1 Atlético/MG
|
|
46
45
|
|
47
46
|
## Usage
|
48
47
|
|
49
|
-
### Working with Pages
|
50
48
|
|
51
|
-
|
49
|
+
### Download (and Cache) Pages
|
50
|
+
|
51
|
+
To download (and cache) pages from the world wide web use:
|
52
52
|
|
53
53
|
``` ruby
|
54
|
-
|
54
|
+
Rsssf.download_page( 'https://rsssf.org/tablese/eng2024.html',
|
55
|
+
encoding: 'Windows-1252' )
|
56
|
+
|
57
|
+
Rsssf.download_page( 'https://rsssf.org/tablesb/braz2024.html',
|
58
|
+
encoding: 'Windows-1252' )
|
59
|
+
```
|
60
|
+
|
61
|
+
Note: Most pages on rsssf.org use the Windows-1252 (character) encoding.
|
62
|
+
To "auto-magically" convert to unicode (utf-8)
|
63
|
+
add the encoding option (default is `UTF-8`).
|
64
|
+
|
65
|
+
Or as a convenience shortcut download (pre-configured table) pages by country code (e.g. `eng` - England, `es` - Spain (España), `de` - Germany (Deutschland), `br` - Brazil (Brasil) etc.)
|
66
|
+
and season (e.g. `2023/24` or `2024` etc.)
|
67
|
+
|
68
|
+
``` ruby
|
69
|
+
Rsssf.download_table( 'eng', season: '2023/24' )
|
70
|
+
|
71
|
+
Rsssf.download_table( 'br', season: '2024' )
|
55
72
|
```
|
56
73
|
|
57
|
-
|
74
|
+
|
75
|
+
Note: The rsssf machinery uses a built-in web cache. All downloads get "auto-magically" cached (in `./cache/rsssf.org`).
|
76
|
+
|
77
|
+
|
78
|
+
|
79
|
+
### Working with Pages
|
80
|
+
|
81
|
+
|
82
|
+
Note: The `RsssfPage` machinery will convert the rsssf archive page
|
58
83
|
from hypertext (HTML) to plain text e.g.
|
59
84
|
|
60
85
|
```
|
@@ -121,7 +146,7 @@ Step 2: Fetch all archive pages
|
|
121
146
|
Use:
|
122
147
|
|
123
148
|
``` ruby
|
124
|
-
repo = RsssfRepo.new( './
|
149
|
+
repo = RsssfRepo.new( './england', title: 'England (and Wales)' )
|
125
150
|
repo.fetch_pages
|
126
151
|
```
|
127
152
|
|
@@ -139,7 +164,7 @@ football.db RSSSF Archive Data Summary for England (and Wales)
|
|
139
164
|
|
140
165
|
_Last Update: 2015-11-26 18:22:22 +0200_
|
141
166
|
|
142
|
-
| Season | File | Authors | Last Updated | Lines (Chars) | Sections |
|
167
|
+
| Season | File | Authors | Last Updated | Lines (Chars) | Sections |
|
143
168
|
| :------ | :------ | :------- | :----------- | ------------: | :------- |
|
144
169
|
| 2014-15 | [eng2015.txt](https://github.com/rsssf/eng-england/blob/master/tables/eng2015.txt) | Ian King and Karel Stokkermans | 4 Jun 2015 | 1249 (34138) | Premier League, Cup Tournaments, Championship, Division 1, Division 2, Conference |
|
145
170
|
| 2013-14 | [eng2014.txt](https://github.com/rsssf/eng-england/blob/master/tables/eng2014.txt) | Ian King and Karel Stokkermans | 5 Feb 2015 | 1254 (34294) | Premier League, Cup Tournaments, Championship, Division 1, Division 2, Conference |
|
@@ -170,23 +195,15 @@ schedule.save( './facup.txt' )
|
|
170
195
|
|
171
196
|
|
172
197
|
|
173
|
-
## Install
|
174
|
-
|
175
|
-
Just install the gem:
|
176
|
-
|
177
|
-
$ gem install rsssf
|
178
|
-
|
179
|
-
|
180
|
-
|
181
198
|
## RSSSF Datasets
|
182
199
|
|
183
200
|
See the rsssf github org for pre-processed ready-to-import datasets. Prepared repos include:
|
184
201
|
|
185
|
-
- [`
|
186
|
-
- [`
|
187
|
-
- [`
|
188
|
-
- [`
|
189
|
-
- [`
|
202
|
+
- [`england`](https://github.com/rsssf/england) - rsssf archive data for England - Premier League, Championship, FA Cup etc.
|
203
|
+
- [`deutschland`](https://github.com/rsssf/deutschland) - rsssf archive data for Germany (Deutschland) - Deutsche Bundesliga, 2. Bundesliga, 3. Liga, DFB Pokal etc.
|
204
|
+
- [`espana`](https://github.com/rsssf/espana) - rsssf archive data for España (Spain) - Primera División / La Liga, Copa del Rey, etc.
|
205
|
+
- [`austria`](https://github.com/rsssf/austria) - rsssf archive data for Austria (Österreich) - Österr. Bundesliga, Erste Liga, ÖFB Pokal etc.
|
206
|
+
- [`brazil`](https://github.com/rsssf/brazil) - rsssf archive data for Brazil (Brasil) - Campeonato Brasileiro Série A / Brasileirão etc.
|
190
207
|
- and more
|
191
208
|
|
192
209
|
|
data/Rakefile
CHANGED
@@ -8,25 +8,26 @@ Hoe.spec 'rsssf' do
|
|
8
8
|
self.summary = "rsssf - tools 'n' scripts for RSSSF (Rec.Sport.Soccer Statistics Foundation) archive data"
|
9
9
|
self.description = summary
|
10
10
|
|
11
|
-
self.urls =
|
11
|
+
self.urls = { home: 'https://github.com/sportdb/sport.db.sources' }
|
12
12
|
|
13
13
|
self.author = 'Gerald Bauer'
|
14
|
-
self.email = '
|
14
|
+
self.email = 'gerald.bauer@gmail.com'
|
15
15
|
|
16
16
|
# switch extension to .markdown for github formatting
|
17
17
|
self.readme_file = 'README.md'
|
18
|
-
self.history_file = '
|
18
|
+
self.history_file = 'CHANGELOG.md'
|
19
19
|
|
20
20
|
self.extra_deps = [
|
21
|
-
['
|
22
|
-
['
|
23
|
-
['
|
21
|
+
['cocos'],
|
22
|
+
['season-formats'],
|
23
|
+
['rsssf-parser'], ## add rsssf parser machinery & tool
|
24
24
|
]
|
25
25
|
|
26
|
+
|
26
27
|
self.licenses = ['Public Domain']
|
27
28
|
|
28
29
|
self.spec_extras = {
|
29
|
-
required_ruby_version: '>=
|
30
|
+
required_ruby_version: '>= 2.2.2'
|
30
31
|
}
|
31
32
|
|
32
33
|
end
|
@@ -0,0 +1,495 @@
|
|
1
|
+
|
2
|
+
module Rsssf
|
3
|
+
class PageConverter
|
4
|
+
|
5
|
+
## convenience helper
|
6
|
+
def self.convert( html, url: )
|
7
|
+
@@converter ||= new ## use a "shared" built-in converter
|
8
|
+
@@converter.convert( html, url: url )
|
9
|
+
end
|
10
|
+
|
11
|
+
##
|
12
|
+
## add anchor: options or such
|
13
|
+
## lets you toggle adding anchors (§premier etc.) - why? why not?
|
14
|
+
|
15
|
+
def convert( html, url: )
|
16
|
+
### todo/fix: first check if html is all ascii-7bit e.g.
|
17
|
+
## includes only chars from 0 to 127!!!
|
18
|
+
|
19
|
+
## normalize newlines
|
20
|
+
## remove \r (carriage return) used by Windows; just use \n (new line)
|
21
|
+
html = html.gsub( "\r", '' )
|
22
|
+
|
23
|
+
## check for html entities
|
24
|
+
html = html.gsub( "&auml;", 'ä' )
|
25
|
+
html = html.gsub( "&ouml;", 'ö' )
|
26
|
+
html = html.gsub( "&uuml;", 'ü' )
|
27
|
+
html = html.gsub( "&Auml;", 'Ä' )
|
28
|
+
html = html.gsub( "&Ouml;", 'Ö' )
|
29
|
+
html = html.gsub( "&Uuml;", 'Ü' )
|
30
|
+
html = html.gsub( "&szlig;", 'ß' )
|
31
|
+
|
32
|
+
## typos / autofix - keep - why? why not?
|
33
|
+
html = html.gsub( "&oulm;", 'ö' ) ## support typo in entity (ö)
|
34
|
+
html = html.gsub( "&uml;", 'ü' ) ## support typo in entity (ü) - why? why not?
|
35
|
+
html = html.gsub( "&slig;", "ß" ) ## support typo in entity (ß)
|
36
|
+
html = html.gsub( "&aaacute;", "á" ) ## typo for á
|
37
|
+
|
38
|
+
|
39
|
+
html = html.gsub( "&Eacute;", 'É' )
|
40
|
+
html = html.gsub( "&oslash;", 'ø' )
|
41
|
+
html = html.gsub( "&atilde;", 'ã' )
|
42
|
+
html = html.gsub( "&otilde;", 'õ' )
|
43
|
+
html = html.gsub( "&ocirc;", 'ô' )
|
44
|
+
|
45
|
+
entities = %w[
|
46
|
+
À À
|
47
|
+
Á Á
|
48
|
+
 Â
|
49
|
+
à Ã
|
50
|
+
Ä Ä
|
51
|
+
Å Å
|
52
|
+
à à
|
53
|
+
á á
|
54
|
+
â â
|
55
|
+
ã ã
|
56
|
+
ä ä
|
57
|
+
å å
|
58
|
+
Æ Æ
|
59
|
+
æ æ
|
60
|
+
ß ß
|
61
|
+
Ç Ç
|
62
|
+
ç ç
|
63
|
+
È È
|
64
|
+
É É
|
65
|
+
Ê Ê
|
66
|
+
Ë Ë
|
67
|
+
è è
|
68
|
+
é é
|
69
|
+
ê ê
|
70
|
+
ë ë
|
71
|
+
Ì Ì
|
72
|
+
Í Í
|
73
|
+
Î Î
|
74
|
+
Ï Ï
|
75
|
+
ì ì
|
76
|
+
í í
|
77
|
+
î î
|
78
|
+
ï ï
|
79
|
+
Ñ Ñ
|
80
|
+
ñ ñ
|
81
|
+
Ò Ò
|
82
|
+
Ó Ó
|
83
|
+
Ô Ô
|
84
|
+
Õ Õ
|
85
|
+
Ö Ö
|
86
|
+
ò ò
|
87
|
+
ó ó
|
88
|
+
ô ô
|
89
|
+
õ õ
|
90
|
+
ö ö
|
91
|
+
Ø Ø
|
92
|
+
ø ø
|
93
|
+
Ù Ù
|
94
|
+
Ú Ú
|
95
|
+
Û Û
|
96
|
+
Ü Ü
|
97
|
+
ù ù
|
98
|
+
ú ú
|
99
|
+
û û
|
100
|
+
ü ü
|
101
|
+
Ý Ý
|
102
|
+
ý ý
|
103
|
+
ÿ ÿ
|
104
|
+
|
105
|
+
< <
|
106
|
+
> >
|
107
|
+
& &
|
108
|
+
© ©
|
109
|
+
® ®
|
110
|
+
|
111
|
+
Š Š
|
112
|
+
š š
|
113
|
+
č č
|
114
|
+
ć ć
|
115
|
+
Ž Ž
|
116
|
+
’ ’
|
117
|
+
]
|
118
|
+
|
119
|
+
|
120
|
+
|
121
|
+
entities.each_slice(2) do |str, entity|
|
122
|
+
html = html.gsub( entity, str )
|
123
|
+
end
|
124
|
+
|
125
|
+
|
126
|
+
|
127
|
+
##############
|
128
|
+
## check for more entities
|
129
|
+
## limit &---; to length 10 - why? why not?
|
130
|
+
html = html.gsub( /&[^; ]{1,10};/) do |match|
|
131
|
+
|
132
|
+
match = if match == 'ij' ## use like Van Dijk -> Van Dijk
|
133
|
+
'ij'
|
134
|
+
else
|
135
|
+
msg = "found unencoded html entity #{match}"
|
136
|
+
puts "*** WARN - #{msg}"
|
137
|
+
log( msg ) ## log too (see logs.txt)
|
138
|
+
|
139
|
+
match ## pass through as is (1:1)
|
140
|
+
end
|
141
|
+
|
142
|
+
match
|
143
|
+
end
|
144
|
+
## todo/fix: add more entities
|
145
|
+
|
146
|
+
###################################
|
147
|
+
### smart quotes quick fixes
|
148
|
+
### convert all "smart" quotes to (standard) single quotes
|
149
|
+
## D´Alessandro => D'Alessandro
|
150
|
+
|
151
|
+
html = html.gsub( '´', "'" )
|
152
|
+
|
153
|
+
html = html.gsub( '’', "'" )
|
154
|
+
html = html.gsub( '‘', "'" )
|
155
|
+
html = html.gsub( '“', '"' )
|
156
|
+
html = html.gsub( '”', '"' )
|
157
|
+
|
158
|
+
### convert fancy dashes/hyphens to plain dash/hyphen
|
159
|
+
html = html.gsub( '–', '-' )
|
160
|
+
|
161
|
+
|
162
|
+
|
163
|
+
txt = html_to_txt( html )
|
164
|
+
|
165
|
+
header = <<EOS
|
166
|
+
<!--
|
167
|
+
source: #{url}
|
168
|
+
-->
|
169
|
+
|
170
|
+
EOS
|
171
|
+
|
172
|
+
header+txt ## return txt w/ header
|
173
|
+
end ## method convert
|
174
|
+
|
175
|
+
|
176
|
+
## todo/fix - use generic heading regex for all h2/h3/h4 etc.
|
177
|
+
## exclude h1 - why? why not?
|
178
|
+
## note - include leading and trailing spaces !!!
|
179
|
+
##
|
180
|
+
## note - for content use non-greedy to allow
|
181
|
+
## match of tags inside content too
|
182
|
+
HEADING2_RE = %r{ \s*
|
183
|
+
<H2>
|
184
|
+
(?<title>.+?)
|
185
|
+
</H2>
|
186
|
+
\s*
|
187
|
+
}imx
|
188
|
+
|
189
|
+
HEADING4_RE = %r{ \s*
|
190
|
+
<H4>
|
191
|
+
(?<title>.+?)
|
192
|
+
</H4>
|
193
|
+
\s*
|
194
|
+
}imx
|
195
|
+
|
196
|
+
def replace_h2( html )
|
197
|
+
html.gsub( HEADING2_RE ) do |_|
|
198
|
+
m = Regexp.last_match
|
199
|
+
puts " replace heading 2 (h2) >#{m[:title]}<"
|
200
|
+
"\n\n## #{m[:title]}\n\n" ## note: make sure to always add two newlines
|
201
|
+
end
|
202
|
+
end
|
203
|
+
|
204
|
+
def replace_h4( html )
|
205
|
+
html.gsub( HEADING4_RE ) do |_|
|
206
|
+
m = Regexp.last_match
|
207
|
+
puts " replace heading 4 (h4) >#{m[:title]}<"
|
208
|
+
"\n\n#### #{m[:title]}\n\n" ## note: make sure to always add two newlines
|
209
|
+
end
|
210
|
+
end
|
211
|
+
|
212
|
+
|
213
|
+
def squish( str )
|
214
|
+
## squish more than one white space to one space
|
215
|
+
str.gsub( /[ \r\t\n]+/, ' ' )
|
216
|
+
end
|
217
|
+
|
218
|
+
|
219
|
+
def patch_about( html )
|
220
|
+
# <A name=about>
|
221
|
+
# <H2>About this document</H2></A>
|
222
|
+
# or
|
223
|
+
# <A NAME="about"><H2>About this document</H2></A>
|
224
|
+
# => change to (possible?)
|
225
|
+
# <H2><A name=about>About this document</A></H2>
|
226
|
+
|
227
|
+
html.sub( %r{<A [ ] name=(about|"about")> \s*
|
228
|
+
<H2>About [ ] this [ ] document</H2></A>
|
229
|
+
}ixm,
|
230
|
+
"<H2><A name=about>About this document</A></H2>"
|
231
|
+
)
|
232
|
+
end
|
233
|
+
|
234
|
+
# <a name="sa">Série A</a>
|
235
|
+
# <a name="sd">Série D</a>
|
236
|
+
|
237
|
+
# <A name=about>
|
238
|
+
# <H2>About this document</H2></A>
|
239
|
+
# => change to (possible?)
|
240
|
+
# <H2><A name=about>About this document</A></H2>
|
241
|
+
#
|
242
|
+
#
|
243
|
+
# <h4><a name="cb">Copa do Brasil</a></h4>
|
244
|
+
|
245
|
+
## note - for content use non-greedy to allow
|
246
|
+
## match of tags inside content too
|
247
|
+
|
248
|
+
A_NAME_RE = %r{<A [ ]+ NAME [ ]* =
|
249
|
+
(?<name>[^>]+?)
|
250
|
+
>
|
251
|
+
(?<title>.+?)
|
252
|
+
</A>
|
253
|
+
}imx
|
254
|
+
|
255
|
+
# <a href="#sa">Série A</a><br>
|
256
|
+
#
|
257
|
+
# <A href="http://www.rsssf.org/">Rec.Sport.Soccer
|
258
|
+
# Statistics Foundation</A>
|
259
|
+
# <A href="http://www.rsssfbrasil.com">RSSSF
|
260
|
+
# Brazil</A>
|
261
|
+
#
|
262
|
+
# and Daniel Dalence (<A
|
263
|
+
# href="mailto:danielballack@terra.com.br">danielballack@terra.com.br</A>)
|
264
|
+
|
265
|
+
|
266
|
+
A_HREF_RE = %r{<A \s+ HREF [ ]* =
|
267
|
+
(?<href>[^>]+?)
|
268
|
+
>
|
269
|
+
(?<title>.+?)
|
270
|
+
<\/A>
|
271
|
+
}imx
|
272
|
+
|
273
|
+
|
274
|
+
def replace_a_href( html )
|
275
|
+
## remove anchors (a href)
|
276
|
+
# note: heading 4 includes anchor (thus, let anchors go first)
|
277
|
+
# note: <a \newline href is used for authors email - thus incl. support for newline as space
|
278
|
+
html.gsub( A_HREF_RE ) do |match| ## note: use .+? non-greedy match
|
279
|
+
m = Regexp.last_match
|
280
|
+
href = m[:href].gsub( /["']/, '' ).strip ## remove ("" or '')
|
281
|
+
title = m[:title].strip ## note: "save" capture first; gets replaced by gsub (next regex call)
|
282
|
+
|
283
|
+
|
284
|
+
## e.g.
|
285
|
+
## ‹Larsen23@gmx.de, see page mailto:Larsen23@gmx.de›
|
286
|
+
## ‹danielballack@terra.com.br, see page mailto:danielballack@terra.com.br›
|
287
|
+
## ‹zja70@aol.com, see page mailto:zja70@aol.com›)
|
288
|
+
if href.start_with?( 'mailto:')
|
289
|
+
puts " blank mailto - anchor (a) href >#{href}, >#{title}<"
|
290
|
+
'‹mailto›' ## delete/remove email
|
291
|
+
else
|
292
|
+
puts " replace anchor (a) href >#{href}, >#{title}<"
|
293
|
+
|
294
|
+
## convert href to xref
|
295
|
+
xref = if href.start_with?('#') ## in-page ref
|
296
|
+
", see §#{href[1..-1]}"
|
297
|
+
elsif href.start_with?( /https?:/ ) ## external page ref
|
298
|
+
## skip - keep empty - why? why not? (or add url domain?)
|
299
|
+
''
|
300
|
+
else
|
301
|
+
## hack - check for some custom excludes
|
302
|
+
if title.start_with?( 'Rec.Sport.Soccer' )
|
303
|
+
## skip - keep empty
|
304
|
+
''
|
305
|
+
else
|
306
|
+
## strip (ending) .htm|html
|
307
|
+
", see page #{href.sub( /\.html?$/,'')}"
|
308
|
+
end
|
309
|
+
end
|
310
|
+
|
311
|
+
"‹#{squish(title)}#{xref}›"
|
312
|
+
end
|
313
|
+
end
|
314
|
+
end
|
315
|
+
|
316
|
+
def replace_a_name( html )
|
317
|
+
##
|
318
|
+
## remove (named) anchors
|
319
|
+
html.gsub( A_NAME_RE ) do |match| ## note: use .+? non-greedy match
|
320
|
+
m = Regexp.last_match
|
321
|
+
name = m[:name].gsub( /["']/, '' ).strip ## remove ("" or '')
|
322
|
+
title = m[:title].strip ## note: "save" capture first; gets replaced by gsub (next regex call)
|
323
|
+
match = match.gsub( "\n", '$$' ) ## make newlines visible for debugging
|
324
|
+
puts " replace anchor (a) name >#{name}<, >#{title}< - >#{match}<"
|
325
|
+
|
326
|
+
|
327
|
+
##
|
328
|
+
## todo - report WARN if title incl. tags
|
329
|
+
## assumes text only for now - why? why not?
|
330
|
+
## add a name inside heading !!!
|
331
|
+
## do NOT add heading inside a name !!!
|
332
|
+
|
333
|
+
"#{title} ‹§#{name}›" ## note - use two spaces min (between title & name)
|
334
|
+
end
|
335
|
+
end
|
336
|
+
|
337
|
+
|
338
|
+
EMAIL_RE = %r{ \s*
|
339
|
+
\(
|
340
|
+
[a-z][a-z0-9_]+
|
341
|
+
@[a-z]+(\.[a-z]+)+
|
342
|
+
\)
|
343
|
+
}imx
|
344
|
+
|
345
|
+
|
346
|
+
def remove_emails( html )
|
347
|
+
### remove converted ("blinded") mailto anchors
|
348
|
+
## note usually inside () e.g.
|
349
|
+
## (‹mailto›)
|
350
|
+
## plus slurp up all leading whitespace (incl. newline) - why? why not?
|
351
|
+
html = html.gsub( /\s*
|
352
|
+
\(‹mailto›\)
|
353
|
+
/xm, '' )
|
354
|
+
|
355
|
+
###
|
356
|
+
## remove "regular" emails too e.g.
|
357
|
+
##
|
358
|
+
## Thanks to Marcelo Leme de Arruda (___@___.__.br),
|
359
|
+
## Ricardo FF Pontes (___@____.com),
|
360
|
+
## Santiago Reis (____@____.com.br),
|
361
|
+
## Marcos Lacerda Queiroz (___@____.com.br)
|
362
|
+
## etc.
|
363
|
+
|
364
|
+
## check for "free-standing e.g. on its own line" emails only for now
|
365
|
+
html = html.gsub( EMAIL_RE ) do |match|
|
366
|
+
puts "removing email >#{match}<"
|
367
|
+
''
|
368
|
+
end
|
369
|
+
html
|
370
|
+
end
|
371
|
+
|
372
|
+
|
373
|
+
|
374
|
+
def html_to_txt( html )
|
375
|
+
|
376
|
+
###
|
377
|
+
# todo: check if any tags (still) present??
|
378
|
+
|
379
|
+
|
380
|
+
## cut off everything before body
|
381
|
+
html = html.sub( /.+?<BODY>\s*/im, '' )
|
382
|
+
|
383
|
+
## cut off everything after body (closing)
|
384
|
+
html = html.sub( /<\/BODY>.*/im, '' )
|
385
|
+
|
386
|
+
html = patch_about( html )
|
387
|
+
|
388
|
+
## remove cite
|
389
|
+
html = html.gsub( /<CITE>([^<]+)<\/CITE>/im ) do |_|
|
390
|
+
puts " remove cite >#{$1}<"
|
391
|
+
"#{$1}"
|
392
|
+
end
|
393
|
+
|
394
|
+
html = html.gsub( /\s*<HR>\s*/im ) do |match|
|
395
|
+
match = match.gsub( "\n", '$$' ) ## make newlines visible for debugging
|
396
|
+
puts " replace horizontal rule (hr) - >#{match}<"
|
397
|
+
"\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n" ## check what hr to use use - . - . - or =-=-=-= or somehting distinct?
|
398
|
+
end
|
399
|
+
|
400
|
+
## replace break (br)
|
401
|
+
## note: do NOT use m/multiline for now - why? why not??
|
402
|
+
html = html.gsub( /<BR>\s*/i ) do |match| ## note: include (swallow) "extra" newline
|
403
|
+
match = match.gsub( "\n", '$$' ) ## make newlines visible for debugging
|
404
|
+
puts " replace break (br) - >#{match}<"
|
405
|
+
"\n"
|
406
|
+
end
|
407
|
+
|
408
|
+
|
409
|
+
|
410
|
+
html = replace_a_href( html )
|
411
|
+
## note a name="about" includes more a hrefs etc.
|
412
|
+
# let it go first (before a href)
|
413
|
+
html = replace_a_name( html )
|
414
|
+
|
415
|
+
|
416
|
+
|
417
|
+
## replace paragraph (p)
|
418
|
+
html = html.gsub( /\s*<P>\s*/im ) do |match| ## note: include (swallow) "extra" newline
|
419
|
+
match = match.gsub( "\n", '$$' ) ## make newlines visible for debugging
|
420
|
+
puts " replace paragraph (p) - >#{match}<"
|
421
|
+
"\n\n"
|
422
|
+
end
|
423
|
+
html = html.gsub( /<\/P>/i, '' ) ## replace paragraph (p) closing w/ nothing for now
|
424
|
+
|
425
|
+
## remove i
|
426
|
+
html = html.gsub( /<I>([^<]+)<\/I>/im ) do |_|
|
427
|
+
puts " remove italic (i) >#{$1}<"
|
428
|
+
"#{$1}"
|
429
|
+
end
|
430
|
+
|
431
|
+
|
432
|
+
html = replace_h2( html )
|
433
|
+
html = replace_h4( html )
|
434
|
+
|
435
|
+
|
436
|
+
|
437
|
+
|
438
|
+
## remove b - note: might include anchors (thus, call after anchors)
|
439
|
+
html = html.gsub( /<B>([^<]+)<\/B>/im ) do |_|
|
440
|
+
puts " remove bold (b) >#{$1}<"
|
441
|
+
"**#{$1}**"
|
442
|
+
end
|
443
|
+
|
444
|
+
## replace preformatted (pre)
|
445
|
+
html = html.gsub( /<PRE>|<\/PRE>/i ) do |_|
|
446
|
+
puts " replace preformatted (pre)"
|
447
|
+
'' # replace w/ nothing for now (keep surrounding newlines)
|
448
|
+
end
|
449
|
+
|
450
|
+
=begin
|
451
|
+
puts
|
452
|
+
puts
|
453
|
+
puts "html:"
|
454
|
+
puts html[0..2000]
|
455
|
+
puts "-- snip --"
|
456
|
+
puts html[-1000..-1] ## print last thousand chars
|
457
|
+
=end
|
458
|
+
|
459
|
+
|
460
|
+
html = remove_emails( html )
|
461
|
+
|
462
|
+
|
463
|
+
## cleanup whitespaces
|
464
|
+
## todo/fix: convert newline in space first
|
465
|
+
## and then collapse spaces etc.!!!
|
466
|
+
txt = String.new
|
467
|
+
html.each_line do |line|
|
468
|
+
line = line.gsub( "\t", ' ' ) # replace all tabs w/ two spaces for now
|
469
|
+
line = line.rstrip # remove trailing whitespace (incl. newline/formfeed)
|
470
|
+
|
471
|
+
txt << line
|
472
|
+
txt << "\n"
|
473
|
+
end
|
474
|
+
|
475
|
+
txt
|
476
|
+
end # method html_to_txt
|
477
|
+
|
478
|
+
|
479
|
+
|
480
|
+
###
|
481
|
+
# more helpers
|
482
|
+
def log( msg )
|
483
|
+
## append msg to ./logs.txt
|
484
|
+
## use ./errors.txt - why? why not?
|
485
|
+
File.open( './logs.txt', 'a:utf-8' ) do |f|
|
486
|
+
f.write( msg )
|
487
|
+
f.write( "\n" )
|
488
|
+
end
|
489
|
+
end
|
490
|
+
|
491
|
+
|
492
|
+
|
493
|
+
end # class PageConverter
|
494
|
+
end # module Rsssf
|
495
|
+
|