factbook 1.2.2 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/{HISTORY.md → CHANGELOG.md} +3 -3
- data/Manifest.txt +1 -1
- data/README.md +548 -543
- data/Rakefile +34 -33
- data/data/codes.csv +262 -262
- data/data/codesxref.csv +280 -280
- data/lib/factbook.rb +68 -75
- data/lib/factbook/builder.rb +14 -3
- data/lib/factbook/builder_item.rb +93 -59
- data/lib/factbook/page.rb +20 -57
- data/lib/factbook/sanitizer.rb +98 -285
- data/lib/factbook/version.rb +21 -22
- data/script/json.rb +3 -2
- data/test/data/src/au.html +658 -658
- data/test/data/src/be.html +648 -648
- data/test/helper.rb +11 -11
- data/test/test_fields.rb +52 -52
- data/test/test_json.rb +45 -45
- data/test/test_page.rb +38 -38
- metadata +31 -11
data/lib/factbook/sanitizer.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
1
|
|
3
2
|
module Factbook
|
4
3
|
|
@@ -37,52 +36,24 @@ def sanitize( html_ascii )
|
|
37
36
|
page_info.last_updated = find_page_last_updated( html_ascii )
|
38
37
|
|
39
38
|
|
40
|
-
|
39
|
+
html = find_country_profile( html_ascii ) ## cut-off headers, footers, scripts, etc.
|
41
40
|
|
42
41
|
## todo/fix: assume windows 12xx encoding!!!! for factbook - try
|
43
|
-
html, errors = encode_utf8( html_profile_ascii ) ## change encoding to utf-8 (from binary/ascii8bit)
|
42
|
+
# html, errors = encode_utf8( html_profile_ascii ) ## change encoding to utf-8 (from binary/ascii8bit)
|
44
43
|
|
45
|
-
html = sanitize_profile( html )
|
44
|
+
# html = sanitize_profile( html )
|
46
45
|
|
47
|
-
[html, page_info,
|
46
|
+
[html, page_info, []]
|
48
47
|
end
|
49
48
|
|
50
49
|
|
51
|
-
|
52
|
-
|
53
|
-
#
|
54
|
-
#
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
class="expandcollapse">
|
59
|
-
/xim ## ignore case; multi-line; ignore space (use \s for space/newline!)
|
60
|
-
|
61
|
-
|
62
|
-
####
|
63
|
-
# example match:
|
64
|
-
#
|
65
|
-
# ++ 2016 Nov/3:
|
66
|
-
#
|
67
|
-
# </li>
|
68
|
-
# </ul>
|
69
|
-
# <!-- end generated content -->
|
70
|
-
#
|
71
|
-
### todo: just use first match of </li></ul> - why? why not?
|
72
|
-
#
|
73
|
-
# history/changes:
|
74
|
-
# ++ 2015 Sept/24 (for regex see attic):
|
75
|
-
#
|
76
|
-
# </li>
|
77
|
-
# </ul>
|
78
|
-
# </tbody></table>
|
79
|
-
#
|
80
|
-
|
81
|
-
|
82
|
-
END_FACTS_REGEX = /<\/li> \s*
|
83
|
-
<\/ul> \s*
|
84
|
-
<!-- \s end \s generated \s content \s -->
|
85
|
-
/xim ## ignore case; multi-line; ignore space (use \s for space/newline!)
|
50
|
+
#
|
51
|
+
# <span class="subfield-date" aria-label="Date of information: 2018">(2018)</span>
|
52
|
+
#
|
53
|
+
# remove aria labels
|
54
|
+
ARIA_ATTR_REGEX = /\s*
|
55
|
+
aria-label=('|").+?\1 ## note: use non-greedy match e.g. .+?
|
56
|
+
/xim ## do NOT allow multi-line - why? why not?
|
86
57
|
|
87
58
|
|
88
59
|
def find_country_profile( html )
|
@@ -90,273 +61,115 @@ def find_country_profile( html )
|
|
90
61
|
## remove header (everything before)
|
91
62
|
## <ul class="expandcollapse">
|
92
63
|
|
93
|
-
|
94
|
-
fail "*** no begin facts marker found for page" if pos.nil?
|
64
|
+
doc = Nokogiri::HTML( html )
|
95
65
|
|
96
|
-
|
97
|
-
html = html[pos..-1]
|
66
|
+
ul = doc.css( 'ul.expandcollapse' )[0]
|
98
67
|
|
99
|
-
|
68
|
+
puts ul.to_html[0..100]
|
100
69
|
|
101
|
-
###
|
102
|
-
## remove footer
|
103
|
-
## assume everything after (last list item in unordered list inside a table body)
|
104
|
-
## </li>
|
105
|
-
## </ul>
|
106
|
-
## </tbody></table>
|
107
70
|
|
108
|
-
pos = html.index( END_FACTS_REGEX )
|
109
|
-
fail "*** no end facts marker found for page" if pos.nil?
|
110
71
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
72
|
+
## note: special case cc uses h2 instead of div block
|
73
|
+
## <h2 class="question cam_med" sectiontitle="Introduction" ccode="cc"
|
74
|
+
## style="border-bottom: 2px solid white; cursor: pointer;">
|
75
|
+
## Introduction :: <span class="region">CURACAO </span>
|
76
|
+
## </h2>
|
77
|
+
## is old format !!!!
|
78
|
+
## cc - CURACAO
|
79
|
+
## http header says - last-modified: Wed, 14 Nov 2018 14:09:28 GMT
|
80
|
+
## page says - PAGE LAST UPDATED ON MARCH 14, 2018
|
81
|
+
## wait for new version to be generated / pushed!!!
|
117
82
|
|
83
|
+
## check for old format if h2 are present
|
84
|
+
h2s = ul.css( 'h2' )
|
85
|
+
if h2s.size > 0
|
86
|
+
puts " !! WARN: found #{h2s.size} h2(s) - assume old format - sorry - must wait for update!!!"
|
87
|
+
## return empty html string - why? why not?
|
88
|
+
return ''
|
89
|
+
end
|
118
90
|
|
119
91
|
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
##
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
<\/span>
|
145
|
-
\s*
|
146
|
-
<span \s class='category_data'[^>]*>
|
147
|
-
\s*
|
148
|
-
<a \s [^>]+>
|
149
|
-
.+?
|
150
|
-
<\/a>
|
151
|
-
\s*
|
152
|
-
<\/span>
|
153
|
-
<\/div>
|
154
|
-
/xim
|
155
|
-
|
156
|
-
##
|
157
|
-
## <div class='wrap'>
|
158
|
-
## <div class='audio-player'>
|
159
|
-
## <audio id='audio-player-1' class='my-audio-player' src='../anthems/AU.mp3' type='audio/mp3' controls='controls'>
|
160
|
-
## </audio>
|
161
|
-
## </div></div>
|
162
|
-
|
163
|
-
|
164
|
-
AUDIO_PLAYER_REGEX = /
|
165
|
-
<div \s class='wrap'>
|
166
|
-
<div \s class='audio-player'>
|
167
|
-
<audio \s [^>]+>
|
168
|
-
<\/audio>
|
169
|
-
<\/div>
|
170
|
-
<\/div>
|
171
|
-
/xim
|
172
|
-
|
173
|
-
|
174
|
-
## remove category => Area comparison map:
|
175
|
-
##
|
176
|
-
## <div class='disTable areaComp'
|
177
|
-
## ...
|
178
|
-
## until hitting: <div id='field' -- e.g. next category/field (use lookahead e.g. (?=))
|
179
|
-
|
180
|
-
AREA_COMP_CATEGORY_REGEX = /
|
181
|
-
<div \s class='disTable \s areaComp'
|
182
|
-
.+?
|
183
|
-
(?=<div \s id='field')
|
184
|
-
/xim
|
185
|
-
|
186
|
-
|
187
|
-
## remove category => population pyramid:
|
188
|
-
##
|
189
|
-
## <div class='disTable popPyramid'>
|
190
|
-
## ...
|
191
|
-
## until hitting: <div id='field' -- e.g. next category/field (use lookahead e.g. (?=))
|
192
|
-
|
193
|
-
POP_PYRAMID_CATEGORY_REGEX = /
|
194
|
-
<div \s class='disTable \s popPyramid'
|
195
|
-
.+?
|
196
|
-
(?=<div \s id='field')
|
197
|
-
/xim
|
198
|
-
|
199
|
-
## remove category => religious affiliation:
|
200
|
-
##
|
201
|
-
## <div class='disTable relAffiliation'>
|
202
|
-
## ...
|
203
|
-
## until hitting: <div id='field' -- e.g. next category/field (use lookahead e.g. (?=))
|
204
|
-
|
205
|
-
REL_AFFILIATION_CATEGORY_REGEX = /
|
206
|
-
<div \s class='disTable \s relAffiliation'
|
207
|
-
.+?
|
208
|
-
(?=<div \s id='field')
|
209
|
-
/xim
|
210
|
-
|
211
|
-
|
212
|
-
##########################################
|
213
|
-
## transforms / simplify
|
214
|
-
##
|
215
|
-
## <h2 sectiontitle='Introduction' ccode='ag'>
|
216
|
-
## Introduction :: <span class='region'>ALGERIA </span>
|
217
|
-
## </h2>
|
218
|
-
## becomes =>
|
219
|
-
## <h2>Introduction</h2>
|
220
|
-
##
|
221
|
-
## todo/fix: use named capture in future e.g.
|
222
|
-
## (?<text>.+?) instead of (.+?)
|
223
|
-
## not working for now w/ gsub (just passed in match string NOT match data)
|
224
|
-
|
225
|
-
CLEANUP_SECTION_REGEX = /
|
226
|
-
<h2 [^>]*>
|
227
|
-
(.+?)
|
228
|
-
<\/h2>
|
229
|
-
/xim
|
230
|
-
|
231
|
-
##
|
232
|
-
## <div id='field' class='category'>Electricity - consumption:</div>
|
233
|
-
## becomes =>
|
234
|
-
## <h3>Electricity - consumption:</h3>
|
235
|
-
|
236
|
-
CLEANUP_SUBSECTION_REGEX = /
|
237
|
-
<div \s id='field' [^>]*>
|
238
|
-
(.+?)
|
239
|
-
<\/div>
|
240
|
-
/xim
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
def sanitize_profile( html )
|
245
|
-
|
246
|
-
## remove categories w/ visualizations/graphics only e.g.
|
247
|
-
## - area comparison map
|
248
|
-
## - population pyramid
|
249
|
-
## - religious affiliation
|
250
|
-
|
251
|
-
html = html.gsub( AREA_COMP_CATEGORY_REGEX ) do |m|
|
252
|
-
puts "remove category => area comparison map:"
|
253
|
-
puts "#{m}"
|
254
|
-
''
|
255
|
-
end
|
256
|
-
|
257
|
-
html = html.gsub( POP_PYRAMID_CATEGORY_REGEX ) do |m|
|
258
|
-
puts "remove category => population pyramid:"
|
259
|
-
puts "#{m}"
|
260
|
-
''
|
261
|
-
end
|
262
|
-
|
263
|
-
html = html.gsub( REL_AFFILIATION_CATEGORY_REGEX ) do |m|
|
264
|
-
puts "remove category => religious affiliation:"
|
265
|
-
puts "#{m}"
|
266
|
-
''
|
267
|
-
end
|
268
|
-
|
269
|
-
################################################
|
270
|
-
## more - let's get started
|
271
|
-
|
272
|
-
html = html.gsub( STYLE_ATTR_REGEX ) do |m|
|
273
|
-
puts "remove style attr:"
|
274
|
-
puts "#{m}"
|
275
|
-
''
|
276
|
-
end
|
277
|
-
|
278
|
-
html = html.gsub( AUDIO_PLAYER_REGEX ) do |m|
|
279
|
-
puts "remove audio player:"
|
280
|
-
puts "#{m}"
|
281
|
-
''
|
282
|
-
end
|
283
|
-
|
284
|
-
html = html.gsub( COUNTRY_COMPARISON_REGEX ) do |m|
|
285
|
-
puts "remove country comparison:"
|
286
|
-
puts "#{m}"
|
287
|
-
''
|
288
|
-
end
|
289
|
-
|
290
|
-
## remove/cleanup anchors (a href)
|
291
|
-
html = html.gsub( /<a\s+[^>]+>(.+?)<\/a>/im ) do |_| ## note: use .+? non-greedy match
|
292
|
-
puts " replace anchor (a) >#{$1}<"
|
293
|
-
|
294
|
-
inner_text = $1.dup ## keep a copy
|
295
|
-
if inner_text =~ /<img/ ## if includes image remove
|
296
|
-
puts " remove image in anchor"
|
297
|
-
''
|
298
|
-
else ## keep inner text
|
299
|
-
inner_text
|
92
|
+
###
|
93
|
+
## sanitize
|
94
|
+
|
95
|
+
## remove link items
|
96
|
+
## assume two <li>s are a section
|
97
|
+
|
98
|
+
html = String.new('')
|
99
|
+
|
100
|
+
## filter all li's
|
101
|
+
ul_children = ul.children.select { |el| if el.name == 'li'
|
102
|
+
true
|
103
|
+
else
|
104
|
+
# puts "skipping #{el.name} >#{el.to_html}<"
|
105
|
+
false
|
106
|
+
end
|
107
|
+
}
|
108
|
+
puts " #{ul_children.size} li(s):"
|
109
|
+
ul_children.each_slice(2) do |lis|
|
110
|
+
li = lis[0]
|
111
|
+
div = li.at( 'div[sectiontitle]' )
|
112
|
+
if div.nil?
|
113
|
+
puts "!! ERROR: no section title found in div:"
|
114
|
+
puts li.to_html
|
115
|
+
exit 1
|
300
116
|
end
|
301
|
-
end
|
302
117
|
|
118
|
+
section_title = div['sectiontitle'].to_s
|
303
119
|
|
304
|
-
|
305
|
-
html = html.gsub( /<\/?(li|ul)[^>]*>/im ) do |m|
|
306
|
-
puts " remove list >#{m}<"
|
307
|
-
''
|
308
|
-
end
|
120
|
+
html << "<h2>#{section_title}</h2>\n"
|
309
121
|
|
310
|
-
## clean-up class attrib e.g. remove unknown classes
|
311
|
-
html = html.gsub( CLASS_ATTR_REGEX ) do |m|
|
312
|
-
puts "cleanup class attr:"
|
313
|
-
puts "#{m}"
|
314
|
-
|
315
|
-
klasses = $2.split(' ')
|
316
|
-
klasses = klasses.select do |klass|
|
317
|
-
if ['category', 'category_data'].include?( klass )
|
318
|
-
true
|
319
|
-
else
|
320
|
-
puts " remove class #{klass}"
|
321
|
-
false
|
322
|
-
end
|
323
|
-
end
|
324
122
|
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
123
|
+
li = lis[1]
|
124
|
+
## filter all div's
|
125
|
+
li_children = li.children.select { |el| if el.name =='div'
|
126
|
+
true
|
127
|
+
else
|
128
|
+
# puts "skipping #{el.name} >#{el.to_html}<"
|
129
|
+
false
|
130
|
+
end
|
131
|
+
}
|
132
|
+
puts " #{li_children.size} div(s):"
|
331
133
|
|
134
|
+
li_children.each_slice(2) do |divs|
|
135
|
+
div = divs[0]
|
136
|
+
a = div.css('a')[0]
|
332
137
|
|
333
|
-
|
334
|
-
|
138
|
+
if a
|
139
|
+
html << "\n<h3>#{a.text}:</h3>\n"
|
140
|
+
else
|
141
|
+
puts "!! WARN: no anchor found:"
|
142
|
+
puts div.to_html
|
143
|
+
end
|
335
144
|
|
336
|
-
html = html.gsub( CLEANUP_SECTION_REGEX ) do |_|
|
337
|
-
puts " cleanup section (h2) heading >#{$1}<"
|
338
145
|
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
text = text[0...pos]
|
344
|
-
end
|
345
|
-
text = text.strip # remove trailing space too
|
146
|
+
div = divs[1]
|
147
|
+
div_children = div.children.select {|el| el.name == 'div' ? true : false }
|
148
|
+
div_children.each do |catdiv|
|
149
|
+
if catdiv['class'] && catdiv['class'].index( 'category_data' )
|
346
150
|
|
347
|
-
|
151
|
+
if catdiv['class'].index( 'attachment' )
|
152
|
+
## skip attachments e.g. maps, pop pyramids, etc.
|
153
|
+
else
|
154
|
+
html << catdiv.to_html
|
155
|
+
html << "\n"
|
156
|
+
end
|
157
|
+
else
|
158
|
+
puts "!! WARN: skipping div (W/O category_data class):"
|
159
|
+
puts catdiv.to_html
|
160
|
+
end
|
161
|
+
end
|
348
162
|
end
|
163
|
+
end
|
349
164
|
|
350
|
-
html = html.gsub( CLEANUP_SUBSECTION_REGEX ) do |_|
|
351
|
-
puts " cleanup subsection (h3) heading >#{$1}<"
|
352
|
-
|
353
|
-
text = $1
|
354
|
-
text = text.strip # remove trailing space too
|
355
165
|
|
356
|
-
|
357
|
-
|
166
|
+
html = html.gsub( ARIA_ATTR_REGEX ) do |m|
|
167
|
+
puts "remove aria-label attr:"
|
168
|
+
puts "#{m}"
|
169
|
+
''
|
170
|
+
end
|
358
171
|
|
359
|
-
|
172
|
+
html
|
360
173
|
end
|
361
174
|
|
362
175
|
|
data/lib/factbook/version.rb
CHANGED
@@ -1,22 +1,21 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
end
|
1
|
+
|
2
|
+
module Factbook
|
3
|
+
|
4
|
+
MAJOR = 2
|
5
|
+
MINOR = 0
|
6
|
+
PATCH = 0
|
7
|
+
VERSION = [MAJOR,MINOR,PATCH].join('.')
|
8
|
+
|
9
|
+
def self.version
|
10
|
+
VERSION
|
11
|
+
end
|
12
|
+
|
13
|
+
def self.banner
|
14
|
+
"factbook/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
|
15
|
+
end
|
16
|
+
|
17
|
+
def self.root
|
18
|
+
File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )
|
19
|
+
end
|
20
|
+
|
21
|
+
end
|