factbook 0.1.3 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Manifest.txt +34 -22
- data/README.md +8 -3
- data/Rakefile +2 -263
- data/data/codes.csv +262 -0
- data/data/comparisons.csv +75 -0
- data/lib/factbook/builder.rb +214 -0
- data/lib/factbook/builder_item.rb +93 -0
- data/lib/factbook/codes.rb +119 -0
- data/lib/factbook/comparisons.rb +50 -0
- data/lib/factbook/page.rb +103 -303
- data/lib/factbook/sanitizer.rb +214 -0
- data/lib/factbook/sect.rb +29 -196
- data/lib/factbook/subsect.rb +18 -0
- data/lib/factbook/table.rb +52 -0
- data/lib/factbook/utils.rb +85 -0
- data/lib/factbook/utils_info.rb +102 -0
- data/lib/factbook/version.rb +4 -3
- data/lib/factbook.rb +23 -1
- data/test/data/au.html +579 -0
- data/test/data/au.yml +8 -0
- data/test/data/be.html +596 -0
- data/test/data/be.yml +8 -0
- data/test/data/src/au.html +2006 -0
- data/test/data/src/be.html +2011 -0
- data/test/helper.rb +0 -4
- data/test/test_builder.rb +37 -0
- data/test/test_codes.rb +76 -0
- data/test/test_comparisons.rb +19 -0
- data/test/test_fields.rb +21 -18
- data/test/test_item_builder.rb +99 -0
- data/test/test_json.rb +17 -20
- data/test/test_page.rb +18 -10
- data/test/test_sanitizer.rb +35 -0
- metadata +68 -49
- data/.gemtest +0 -0
- data/test/data/countrytemplate_au.html +0 -4179
- data/test/data/countrytemplate_be.html +0 -4260
- data/test/data/countrytemplate_br.html +0 -4366
- data/test/data/countrytemplate_ee.html +0 -2999
- data/test/data/countrytemplate_ls.html +0 -2728
- data/test/data/countrytemplate_mx.html +0 -4397
- data/test/data/countrytemplate_vt.html +0 -1726
- data/test/data/countrytemplate_xx.html +0 -2898
- data/test/test_page_old.rb +0 -478
- data/test/test_strip.rb +0 -66
data/lib/factbook/page.rb
CHANGED
@@ -1,303 +1,103 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
data
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
## note:
|
106
|
-
## skip missing sections (w/ warning)
|
107
|
-
## e.g. Vatican (Holy See), Liechtenstein etc. have no Energy section, for example
|
108
|
-
|
109
|
-
divs.each_with_index do |rec,i|
|
110
|
-
title = rec[0]
|
111
|
-
div = rec[1]
|
112
|
-
p = html.index( div )
|
113
|
-
if p.nil?
|
114
|
-
## issue warning: if not found
|
115
|
-
logger.warn "***!!! section not found -- #{div} --; skipping"
|
116
|
-
else
|
117
|
-
logger.debug " found section #{i} @ #{p}"
|
118
|
-
indexes << [title,p]
|
119
|
-
end
|
120
|
-
end
|
121
|
-
|
122
|
-
@sects = []
|
123
|
-
|
124
|
-
indexes.each_with_index do |rec,i|
|
125
|
-
title = rec[0]
|
126
|
-
from = rec[1]
|
127
|
-
|
128
|
-
# is last entry? if yes use -1 otherwise pos
|
129
|
-
# note: subtract one (-1) from pos unless end-of-string (-1)
|
130
|
-
to = indexes[i+1].nil? ? -1 : indexes[i+1][1]-1
|
131
|
-
|
132
|
-
## todo: check that from is smaller than to
|
133
|
-
logger.debug " cut section #{i} [#{from}..#{to}]"
|
134
|
-
@sects << Sect.new( title, html[ from..to ], @opts )
|
135
|
-
|
136
|
-
##if i==0 || i==1
|
137
|
-
## puts "debug sect #{i}:"
|
138
|
-
## puts ">>>|||#{html[ from..to ]}|||<<<"
|
139
|
-
##end
|
140
|
-
end
|
141
|
-
end
|
142
|
-
|
143
|
-
@sects
|
144
|
-
end
|
145
|
-
|
146
|
-
# Stores the given markup as this page's html.
#
# Intended for debugging and testing: once the markup has been set this
# way the html reader has nothing left to fetch over the network.
def html=(html)
@html = html
end
|
151
|
-
|
152
|
-
# Returns the cleaned-up country page markup, building it on first access.
#
# When no markup has been stored yet (see the html= writer) the page is
# retrieved with fetch() and then reduced step by step:
#   1. inline <script> / <style> elements and <link> tags are dropped
#   2. everything before <div id="countryInfo"> is cut off (when present)
#   3. "country comparison to the world" spans, style attributes, empty
#      table rows and the world leaders website promo are removed
# Every removed fragment is echoed to stdout preceded by a short label.
#
# Returns the cached String on later calls; returns nil (and caches
# nothing) when fetch() yields nil.
def html
return @html unless @html.nil?

page = fetch()

## note: fetch() returns nil on a non-200 response; stop here instead of
##   failing with NoMethodError on the first gsub below
return nil if page.nil?

## removes every match of pattern from text; echoes label + match to stdout
strip = lambda do |text, label, pattern|
text.gsub( pattern ) do |m|
puts label
puts m
''
end
end

### remove everything up to
##   <div id="countryInfo" style="display: none;">
##  remove everything starting w/ footer
## remove head !!!
## in body remove header n footer

## remove inline script, inline style and link tags
page = strip.call( page, "remove script:", /<script[^>]*>.*?<\/script>/m )
page = strip.call( page, "remove style:",  /<style[^>]*>.*?<\/style>/m )
page = strip.call( page, "remove link:",   /<link[^>]+>/ )

## remove everything before <div id="countryInfo" >
pos = page.index( /<div id="countryInfo"\s*>/ )
page = page[pos..-1] if pos   # marker missing? keep page as is

## remove country comparison
##   e.g. <span class="category" >country comparison to the world:</span>
##        <span class="category_data">
##          <a href="../rankorder/2147rank.html?countryname=Brazil&countrycode=br&regionCode=soa&rank=5#br" onMouseDown="" title="Country comparison to the world" alt="Country comparison to the world">
##            5
##          </a>
##        </span>
##
##   or on a single line:
##        <span class="category" style="padding-left:7px;">country comparison to the world:</span> <span class="category_data">
##          <a href="../rankorder/2147rank.html?countryname=Brazil&countrycode=br&regionCode=soa&rank=5#br" onMouseDown="" title="Country comparison to the world" alt="Country comparison to the world"> 5 </a> </span>
country_comparison_regex = /
<span \s class="category"[^>]*>
country \s comparison \s to \s the \s world:
<\/span>
\s*
<span \s class="category_data"[^>]*>
\s*
<a \s [^>]+>
.+?
<\/a>
\s*
<\/span>
/xm
page = strip.call( page, "remove country comparison:", country_comparison_regex )

## remove style attributes e.g. style="padding-left:7px;"
page = strip.call( page, "remove style attr:", /\s*style="[^"]+"/ )

## remove empty table rows e.g.
##   <tr height="22">
##     <td class="category_data"></td>
##   </tr>
tr_empty_regex = /
<tr[^>]*>
\s*
<td[^>]*> \s* <\/td>
\s*
<\/tr>
/xm
page = strip.call( page, "remove tr emtpy:", tr_empty_regex )

## remove world leader website promo
##   <span class="category">(For more information visit the
##     <a href="/library/publications/world-leaders-1/index.html" target="_blank">World Leaders website</a>
##     <img src="../graphics/soa_newwindow.gif" alt="Opens in New Window" title="Opens in New Window" border="0"/>)
##   </span>
## note: .+? is non-greedy (smallest possible match)
world_leaders_website_regex = /
<span \s class="category"[^>]*>
\(
For \s more \s information \s
.+?
\)
<\/span>
/xm
page = strip.call( page, "remove world leader website promo:", world_leaders_website_regex )

## cache only the fully cleaned page
@html = page
end
|
270
|
-
|
271
|
-
private
|
272
|
-
# Downloads the country page for @code.
#
# The url is SITE_BASE with its '{code}' placeholder replaced by @code.
#
# Returns the response body tagged as UTF-8, or nil (after logging an
# error) when the response code is anything other than '200'.
def fetch
url = SITE_BASE.gsub( '{code}', @code )
res = Fetcher::Worker.new.get_response( url )

unless res.code == '200'
logger.error "fetch HTTP - #{res.code} #{res.message}"
return nil
end

body = res.body

###
# NB: Net::HTTP will NOT set encoding UTF-8 etc.
#   will mostly be ASCII
#   - try to change encoding to UTF-8 ourselves
#
# NB: ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
logger.debug "t.encoding.name (before): #{body.encoding.name}"

## NB: for now "hardcoded" to utf8 - what else can we do?
##  - note: force_encoding will NOT change the chars,
##    only the assumed encoding (w/o translation)
body = body.force_encoding( Encoding::UTF_8 )
logger.debug "t.encoding.name (after): #{body.encoding.name}"

body
end
|
300
|
-
|
301
|
-
end # class Page
|
302
|
-
|
303
|
-
end # module Factbook
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
|
6
|
+
## note:
|
7
|
+
## some factbook pages with chrome (headers, footers, etc.)
|
8
|
+
## are NOT valid utf-8, thus,
|
9
|
+
## treat page as is (e.g. ASCII8BIT)
|
10
|
+
#
|
11
|
+
# only convert to utf8 when header and footer got stripped
|
12
|
+
|
13
|
+
##
|
14
|
+
## be/benin:
|
15
|
+
## Key Force or FC [Lazare S?xx?HOU?xx?TO] -- two invalid byte code chars in Political parties and leaders:
|
16
|
+
#
|
17
|
+
## in Western/Windows-1252 leads to FC [Lazare SÈHOUÉTO];
|
18
|
+
# Lazare Sèhouéto
|
19
|
+
#
|
20
|
+
# looks good - use (assume) Windows-1252 ????
|
21
|
+
|
22
|
+
##
|
23
|
+
# check if the page is plain 7-bit ascii ??? if yes - no worries
|
24
|
+
# if not, log number of chars not using ascii 7-bit
|
25
|
+
|
26
|
+
|
27
|
+
|
28
|
+
class Page
|
29
|
+
include LogUtils::Logging
|
30
|
+
|
31
|
+
attr_accessor :sects
|
32
|
+
|
33
|
+
# Creates a page that starts out with no sections;
# sections get added through the sects accessor.
def initialize
@sects = []
end
|
36
|
+
|
37
|
+
# Convenience shortcut for looking up an entry of the data hash, e.g.
#
#   page['geo']          instead of
#   page.data['geo']
#
# Returns whatever data[key] returns (nil for an unknown key).
def [](key)
## fix: use delegate data, [] from forwardable lib - why?? why not??
data[key]
end
|
47
|
+
|
48
|
+
|
49
|
+
# Returns all sections as a hash: section title => section data.
#
# The hash is built from sects on the first call and cached in @data, so
# sections added later do not show up. Sections sharing a title overwrite
# each other (last one wins).
def data
## note: cache data hash on first build for now;
##   assigning only the finished hash means a failure while reading
##   a section never leaves a half-built hash cached
@data ||= sects.each_with_object( {} ) do |sect, hash|
hash[ sect.title ] = sect.data
end
end
|
61
|
+
|
62
|
+
|
63
|
+
=begin
|
64
|
+
def self.from_url( cc, cn )
|
65
|
+
html_ascii = PageFetcher.new.fetch( cc )
|
66
|
+
self.new( cc, cn, html_ascii )
|
67
|
+
end
|
68
|
+
|
69
|
+
def self.from_file( cc, cn, opts={} )
|
70
|
+
input_dir = opts[:input_dir] || '.'
|
71
|
+
html_ascii = File.read( "#{input_dir}/#{cc}.html" ) ## fix/todo: use ASCII8BIT/binary reader
|
72
|
+
self.new( cc, cn, html_ascii )
|
73
|
+
end
|
74
|
+
=end
|
75
|
+
|
76
|
+
|
77
|
+
end # class Page
|
78
|
+
|
79
|
+
|
80
|
+
=begin
|
81
|
+
class PageFetcher
|
82
|
+
|
83
|
+
def fetch( cc )
|
84
|
+
worker = Fetcher::Worker.new
|
85
|
+
factbook_base = 'https://www.cia.gov/library/publications/the-world-factbook/geos'
|
86
|
+
|
87
|
+
res = worker.get_response( "#{factbook_base}/#{cc}.html" )
|
88
|
+
|
89
|
+
# on error throw exception - why? why not??
|
90
|
+
if res.code != '200'
|
91
|
+
raise Fetcher::HttpError.new( res.code, res.message )
|
92
|
+
end
|
93
|
+
|
94
|
+
###
|
95
|
+
# Note: Net::HTTP will NOT set encoding UTF-8 etc.
|
96
|
+
# will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
|
97
|
+
html = res.body.to_s
|
98
|
+
end
|
99
|
+
end # PageFetcher
|
100
|
+
=end
|
101
|
+
|
102
|
+
|
103
|
+
end # module Factbook
|
@@ -0,0 +1,214 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
## page (meta) info record filled in by Sanitizer#sanitize
PageInfo = Struct.new( *%i[
country_code
country_name
country_affiliation
region_code
region_name
last_updated
] )
|
11
|
+
|
12
|
+
class Sanitizer
|
13
|
+
include LogUtils::Logging
|
14
|
+
include Utils ## pulls in encode_utf8, ...
|
15
|
+
|
16
|
+
|
17
|
+
# Turns a raw factbook country page into a clean profile plus meta info.
#
# html_ascii - page markup as read/fetched, before any re-encoding
#              (treated as binary/ascii8bit, see encode_utf8 call below)
#
# Returns a three-element array:
#   1) html profile without headers, footers, scripts, etc.
#   2) page (meta) info e.g. country_name, country_code, last_updated, etc.
#   3) errors, i.e. the second value handed back by encode_utf8
#      (presumably encoding errors such as invalid byte sequences)
def sanitize( html_ascii )
## todo: add option for (html source) encoding - why?? why not??

meta = find_page_info( html_ascii )

page_info = PageInfo.new
[:country_code,
:country_name,
:country_affiliation,
:region_code,
:region_name].each do |field|
page_info[field] = meta[field]
end
page_info.last_updated = find_page_last_updated( html_ascii )

## cut-off headers, footers, scripts, etc.
profile_ascii = find_country_profile( html_ascii )

## todo/fix: assume windows 12xx encoding!!!! for factbook - try
## change encoding to utf-8 (from binary/ascii8bit)
profile_utf8, errors = encode_utf8( profile_ascii )

[sanitize_profile( profile_utf8 ), page_info, errors]
end
|
46
|
+
|
47
|
+
|
48
|
+
|
49
|
+
## start-of-facts marker:  <ul class="expandcollapse">
##   flags: x = extended (whitespace in pattern ignored), i = ignore case,
##          m = dot matches newline
BEGIN_FACTS_REGEX = /<ul\s+
class="expandcollapse">
/xim

## end-of-facts marker (whitespace allowed between the closing tags):
##   </li>
##   </ul>
##   </tbody></table>
END_FACTS_REGEX = /<\/li>\s*
<\/ul>\s*
<\/tbody>\s*
<\/table>
/xim


# Cuts the country profile (the facts list) out of a complete page.
#
# html - complete page markup
#
# Returns the markup starting at the begin marker and running up to (not
# including) the first end marker found after it, closed off again with
# "</li></ul>\n".
#
# Raises RuntimeError when either marker cannot be found.
#
# NOTE(review): positions and html excerpts are traced on stdout via
#   puts/pp; consider routing them through the logger instead.
def find_country_profile( html )
####
## remove header (everything before)
##   <ul class="expandcollapse">
start_at = html.index( BEGIN_FACTS_REGEX )
raise "*** no begin facts marker found for page" unless start_at

puts " bingo - found BEGIN_FACTS on pos #{start_at}"
profile = html[start_at..-1]

pp profile[0..100]

###
## remove footer
##  assume everything after the last list item
##  (in the unordered list inside a table body) is footer
end_at = profile.index( END_FACTS_REGEX )
raise "*** no end facts marker found for page" unless end_at

puts " bingo - found END_FACTS on pos #{end_at}"

## note: use ... (not .. to cut-off end_at); re-add the closing tags
profile = profile[0...end_at] + "</li></ul>\n"

pp profile[-200..-1]
profile
end
|
89
|
+
|
90
|
+
|
91
|
+
|
92
|
+
## style attribute incl. leading whitespace e.g.  style='...' or style="..."
##   note: uses a non-greedy match (.+?) up to the matching closing quote
##   note: the m flag lets the dot match newlines too, that is,
##         the attribute value may span more than one line
STYLE_ATTR_REGEX = /\s*
style=('|").+?\1
/xim

## class attribute incl. leading whitespace; group 2 holds the class list
##   note: non-greedy match and m flag - same as for the style attribute
CLASS_ATTR_REGEX = /\s*
class=('|")(.+?)\1
/xim

##
## country comparison block (incl. the enclosing div) e.g.
##
##  <div>
##    <span class='category'>country comparison to the world: </span>
##    <span class='category_data'><a href='../rankorder/2147rank.html#au'>114</a></span>
##  </div>
##
## note: the category_data span must hold an anchor, thus, a plain
##       <span class='category_data'>[[191]]</span> does NOT match
COUNTRY_COMPARISON_REGEX = /
<div>
<span \s class='category'[^>]*>
country \s comparison \s to \s the \s world: \s*
<\/span>
\s*
<span \s class='category_data'[^>]*>
\s*
<a \s [^>]+>
.+?
<\/a>
\s*
<\/span>
<\/div>
/xim

##
## audio player (national anthem) e.g.
##
##  <div class='wrap'>
##    <div class='audio-player'>
##      <audio id='audio-player-1' class='my-audio-player' src='../anthems/AU.mp3' type='audio/mp3' controls='controls'>
##      </audio>
##    </div></div>
##
## NOTE(review): the pattern allows no whitespace between the tags -
##   confirm the source pages really have them back to back
AUDIO_PLAYER_REGEX = /
<div \s class='wrap'>
<div \s class='audio-player'>
<audio \s [^>]+>
<\/audio>
<\/div>
<\/div>
/xim


# Cleans up the markup of a country profile.
#
# html - profile markup (already cut out of the page and utf-8 encoded)
#
# Steps, in order:
#   1. remove style attributes, audio players and country comparisons
#   2. replace anchors by their inner text (anchors holding an image
#      get removed completely)
#   3. remove all <ul>/<li> list tags (tag contents stay)
#   4. reduce class attributes to the classes region, category and
#      category_data; class attributes left empty get removed
#
# Returns the cleaned-up markup as a new string.
#
# NOTE(review): every change is traced on stdout via puts; consider
#   routing the trace through the logger instead.
def sanitize_profile( html )

[["remove style attr:",         STYLE_ATTR_REGEX],
["remove audio player:",       AUDIO_PLAYER_REGEX],
["remove country comparison:", COUNTRY_COMPARISON_REGEX]].each do |label, regex|
html = html.gsub( regex ) do |m|
puts label
puts m
''
end
end

## remove/cleanup anchors (a href)
html = html.gsub( /<a\s+href[^>]*>(.+?)<\/a>/im ) do |_|   ## note: use .+? non-greedy match
inner_text = Regexp.last_match( 1 ).dup   ## keep a copy
puts " replace anchor (a) href >#{inner_text}<"

if inner_text.include?( '<img' )   ## if includes image remove
puts " remove image in anchor"
''
else   ## keep inner text
inner_text
end
end

## remove all list e.g. ul/li
##  note: \b stops the tag name right after li/ul; without it tags that
##        merely start with these letters (e.g. <link ...>) got removed too
html = html.gsub( /<\/?(?:li|ul)\b[^>]*>/im ) do |m|
puts " remove list >#{m}<"
''
end

## clean-up class attrib e.g. remove unknown classes
known_klasses = ['region', 'category', 'category_data']

html = html.gsub( CLASS_ATTR_REGEX ) do |m|
puts "cleanup class attr:"
puts m

klasses = Regexp.last_match( 2 ).split(' ').select do |klass|
known = known_klasses.include?( klass )
puts " remove class #{klass}" unless known
known
end

if klasses.empty?
''   ## remove class attrib completely
else
" class='#{klasses.join(' ')}'"   ## note: add leading space!!
end
end

html
end
|
210
|
+
|
211
|
+
|
212
|
+
end # class Sanitizer
|
213
|
+
|
214
|
+
end # module Factbook
|