factbook 1.2.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/{HISTORY.md → CHANGELOG.md} +3 -3
- data/Manifest.txt +1 -1
- data/README.md +548 -543
- data/Rakefile +34 -33
- data/data/codes.csv +262 -262
- data/data/codesxref.csv +280 -280
- data/lib/factbook.rb +68 -75
- data/lib/factbook/builder.rb +14 -3
- data/lib/factbook/builder_item.rb +93 -59
- data/lib/factbook/page.rb +20 -57
- data/lib/factbook/sanitizer.rb +98 -285
- data/lib/factbook/version.rb +21 -22
- data/script/json.rb +3 -2
- data/test/data/src/au.html +658 -658
- data/test/data/src/be.html +648 -648
- data/test/helper.rb +11 -11
- data/test/test_fields.rb +52 -52
- data/test/test_json.rb +45 -45
- data/test/test_page.rb +38 -38
- metadata +31 -11
data/lib/factbook/sanitizer.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
1
|
|
3
2
|
module Factbook
|
4
3
|
|
@@ -37,52 +36,24 @@ def sanitize( html_ascii )
|
|
37
36
|
page_info.last_updated = find_page_last_updated( html_ascii )
|
38
37
|
|
39
38
|
|
40
|
-
|
39
|
+
html = find_country_profile( html_ascii ) ## cut-off headers, footers, scripts, etc.
|
41
40
|
|
42
41
|
## todo/fix: assume windows 12xx encoding!!!! for factbook - try
|
43
|
-
html, errors = encode_utf8( html_profile_ascii ) ## change encoding to utf-8 (from binary/ascii8bit)
|
42
|
+
# html, errors = encode_utf8( html_profile_ascii ) ## change encoding to utf-8 (from binary/ascii8bit)
|
44
43
|
|
45
|
-
html = sanitize_profile( html )
|
44
|
+
# html = sanitize_profile( html )
|
46
45
|
|
47
|
-
[html, page_info,
|
46
|
+
[html, page_info, []]
|
48
47
|
end
|
49
48
|
|
50
49
|
|
51
|
-
|
52
|
-
|
53
|
-
#
|
54
|
-
#
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
class="expandcollapse">
|
59
|
-
/xim ## ignore case; multi-line; ignore space (use \s for space/newline!)
|
60
|
-
|
61
|
-
|
62
|
-
####
|
63
|
-
# example match:
|
64
|
-
#
|
65
|
-
# ++ 2016 Nov/3:
|
66
|
-
#
|
67
|
-
# </li>
|
68
|
-
# </ul>
|
69
|
-
# <!-- end generated content -->
|
70
|
-
#
|
71
|
-
### todo: just use first match of </li></ul> - why? why not?
|
72
|
-
#
|
73
|
-
# history/changes:
|
74
|
-
# ++ 2015 Sept/24 (for regex see attic):
|
75
|
-
#
|
76
|
-
# </li>
|
77
|
-
# </ul>
|
78
|
-
# </tbody></table>
|
79
|
-
#
|
80
|
-
|
81
|
-
|
82
|
-
END_FACTS_REGEX = /<\/li> \s*
|
83
|
-
<\/ul> \s*
|
84
|
-
<!-- \s end \s generated \s content \s -->
|
85
|
-
/xim ## ignore case; multi-line; ignore space (use \s for space/newline!)
|
50
|
+
#
|
51
|
+
# <span class="subfield-date" aria-label="Date of information: 2018">(2018)</span>
|
52
|
+
#
|
53
|
+
# remove aria labels
|
54
|
+
ARIA_ATTR_REGEX = /\s*
|
55
|
+
aria-label=('|").+?\1 ## note: use non-greedy match e.g. .+?
|
56
|
+
/xim ## do NOT allow multi-line - why? why not?
|
86
57
|
|
87
58
|
|
88
59
|
def find_country_profile( html )
|
@@ -90,273 +61,115 @@ def find_country_profile( html )
|
|
90
61
|
## remove header (everything before)
|
91
62
|
## <ul class="expandcollapse">
|
92
63
|
|
93
|
-
|
94
|
-
fail "*** no begin facts marker found for page" if pos.nil?
|
64
|
+
doc = Nokogiri::HTML( html )
|
95
65
|
|
96
|
-
|
97
|
-
html = html[pos..-1]
|
66
|
+
ul = doc.css( 'ul.expandcollapse' )[0]
|
98
67
|
|
99
|
-
|
68
|
+
puts ul.to_html[0..100]
|
100
69
|
|
101
|
-
###
|
102
|
-
## remove footer
|
103
|
-
## assume everthings after (last list item in unorder list inside a table body)
|
104
|
-
## </li>
|
105
|
-
## </ul>
|
106
|
-
## </tbody></table>
|
107
70
|
|
108
|
-
pos = html.index( END_FACTS_REGEX )
|
109
|
-
fail "*** no end facts marker found for page" if pos.nil?
|
110
71
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
72
|
+
## note: special case cc uses h2 instead of div block
|
73
|
+
## <h2 class="question cam_med" sectiontitle="Introduction" ccode="cc"
|
74
|
+
## style="border-bottom: 2px solid white; cursor: pointer;">
|
75
|
+
## Introduction :: <span class="region">CURACAO </span>
|
76
|
+
## </h2>
|
77
|
+
## is old format !!!!
|
78
|
+
## cc - CURACAO
|
79
|
+
## http headers says - last-modified: Wed, 14 Nov 2018 14:09:28 GMT
|
80
|
+
## page says - PAGE LAST UPDATED ON MARCH 14, 2018
|
81
|
+
## wait for new version to be generated / pushed!!!
|
117
82
|
|
83
|
+
## check for old format if h2 are present
|
84
|
+
h2s = ul.css( 'h2' )
|
85
|
+
if h2s.size > 0
|
86
|
+
puts " !! WARN: found #{h2s.size} h2(s) - assume old format - sorry - must wait for update!!!"
|
87
|
+
## return empty html string - why? why not?
|
88
|
+
return ''
|
89
|
+
end
|
118
90
|
|
119
91
|
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
##
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
<\/span>
|
145
|
-
\s*
|
146
|
-
<span \s class='category_data'[^>]*>
|
147
|
-
\s*
|
148
|
-
<a \s [^>]+>
|
149
|
-
.+?
|
150
|
-
<\/a>
|
151
|
-
\s*
|
152
|
-
<\/span>
|
153
|
-
<\/div>
|
154
|
-
/xim
|
155
|
-
|
156
|
-
##
|
157
|
-
## <div class='wrap'>
|
158
|
-
## <div class='audio-player'>
|
159
|
-
## <audio id='audio-player-1' class='my-audio-player' src='../anthems/AU.mp3' type='audio/mp3' controls='controls'>
|
160
|
-
## </audio>
|
161
|
-
## </div></div>
|
162
|
-
|
163
|
-
|
164
|
-
AUDIO_PLAYER_REGEX = /
|
165
|
-
<div \s class='wrap'>
|
166
|
-
<div \s class='audio-player'>
|
167
|
-
<audio \s [^>]+>
|
168
|
-
<\/audio>
|
169
|
-
<\/div>
|
170
|
-
<\/div>
|
171
|
-
/xim
|
172
|
-
|
173
|
-
|
174
|
-
## remove category => Area comparison map:
|
175
|
-
##
|
176
|
-
## <div class='disTable areaComp'
|
177
|
-
## ...
|
178
|
-
## until hitting: <div id='field' -- e.g. next category/field (use lookahead e.g. (?=))
|
179
|
-
|
180
|
-
AREA_COMP_CATEGORY_REGEX = /
|
181
|
-
<div \s class='disTable \s areaComp'
|
182
|
-
.+?
|
183
|
-
(?=<div \s id='field')
|
184
|
-
/xim
|
185
|
-
|
186
|
-
|
187
|
-
## remove category => population pyramid:
|
188
|
-
##
|
189
|
-
## <div class='disTable popPyramid'>
|
190
|
-
## ...
|
191
|
-
## until hitting: <div id='field' -- e.g. next category/field (use lookahead e.g. (?=))
|
192
|
-
|
193
|
-
POP_PYRAMID_CATEGORY_REGEX = /
|
194
|
-
<div \s class='disTable \s popPyramid'
|
195
|
-
.+?
|
196
|
-
(?=<div \s id='field')
|
197
|
-
/xim
|
198
|
-
|
199
|
-
## remove category => religious affiliation:
|
200
|
-
##
|
201
|
-
## <div class='disTable relAffiliation'>
|
202
|
-
## ...
|
203
|
-
## until hitting: <div id='field' -- e.g. next category/field (use lookahead e.g. (?=))
|
204
|
-
|
205
|
-
REL_AFFILIATION_CATEGORY_REGEX = /
|
206
|
-
<div \s class='disTable \s relAffiliation'
|
207
|
-
.+?
|
208
|
-
(?=<div \s id='field')
|
209
|
-
/xim
|
210
|
-
|
211
|
-
|
212
|
-
##########################################
|
213
|
-
## transforms / simplify
|
214
|
-
##
|
215
|
-
## <h2 sectiontitle='Introduction' ccode='ag'>
|
216
|
-
## Introduction :: <span class='region'>ALGERIA </span>
|
217
|
-
## </h2>
|
218
|
-
## becomes =>
|
219
|
-
## <h2>Introduction</h2>
|
220
|
-
##
|
221
|
-
## todo/fix: use named capture in future e.g.
|
222
|
-
## (?<text>.+?) instead of (.+?)
|
223
|
-
## not working for now w/ gsub (just passed in match string NOT match data)
|
224
|
-
|
225
|
-
CLEANUP_SECTION_REGEX = /
|
226
|
-
<h2 [^>]*>
|
227
|
-
(.+?)
|
228
|
-
<\/h2>
|
229
|
-
/xim
|
230
|
-
|
231
|
-
##
|
232
|
-
## <div id='field' class='category'>Electricity - consumption:</div>
|
233
|
-
## becomes =>
|
234
|
-
## <h3>Electricity - consumption:</h3>
|
235
|
-
|
236
|
-
CLEANUP_SUBSECTION_REGEX = /
|
237
|
-
<div \s id='field' [^>]*>
|
238
|
-
(.+?)
|
239
|
-
<\/div>
|
240
|
-
/xim
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
def sanitize_profile( html )
|
245
|
-
|
246
|
-
## remove categories w/ visualizations/graphics only e.g.
|
247
|
-
## - area comparions map
|
248
|
-
## - population pyramid
|
249
|
-
## - religious affiliation
|
250
|
-
|
251
|
-
html = html.gsub( AREA_COMP_CATEGORY_REGEX ) do |m|
|
252
|
-
puts "remove category => area comparison map:"
|
253
|
-
puts "#{m}"
|
254
|
-
''
|
255
|
-
end
|
256
|
-
|
257
|
-
html = html.gsub( POP_PYRAMID_CATEGORY_REGEX ) do |m|
|
258
|
-
puts "remove category => population pyramid:"
|
259
|
-
puts "#{m}"
|
260
|
-
''
|
261
|
-
end
|
262
|
-
|
263
|
-
html = html.gsub( REL_AFFILIATION_CATEGORY_REGEX ) do |m|
|
264
|
-
puts "remove category => religious affiliation:"
|
265
|
-
puts "#{m}"
|
266
|
-
''
|
267
|
-
end
|
268
|
-
|
269
|
-
################################################
|
270
|
-
## more - let's get started
|
271
|
-
|
272
|
-
html = html.gsub( STYLE_ATTR_REGEX ) do |m|
|
273
|
-
puts "remove style attr:"
|
274
|
-
puts "#{m}"
|
275
|
-
''
|
276
|
-
end
|
277
|
-
|
278
|
-
html = html.gsub( AUDIO_PLAYER_REGEX ) do |m|
|
279
|
-
puts "remove audio player:"
|
280
|
-
puts "#{m}"
|
281
|
-
''
|
282
|
-
end
|
283
|
-
|
284
|
-
html = html.gsub( COUNTRY_COMPARISON_REGEX ) do |m|
|
285
|
-
puts "remove country comparison:"
|
286
|
-
puts "#{m}"
|
287
|
-
''
|
288
|
-
end
|
289
|
-
|
290
|
-
## remove/cleanup anchors (a href)
|
291
|
-
html = html.gsub( /<a\s+[^>]+>(.+?)<\/a>/im ) do |_| ## note: use .+? non-greedy match
|
292
|
-
puts " replace anchor (a) >#{$1}<"
|
293
|
-
|
294
|
-
inner_text = $1.dup ## keep a copy
|
295
|
-
if inner_text =~ /<img/ ## if includes image remove
|
296
|
-
puts " remove image in anchor"
|
297
|
-
''
|
298
|
-
else ## keep inner text
|
299
|
-
inner_text
|
92
|
+
###
|
93
|
+
## sanitize
|
94
|
+
|
95
|
+
## remove link items
|
96
|
+
## assume two <li>s are a section
|
97
|
+
|
98
|
+
html = String.new('')
|
99
|
+
|
100
|
+
## filter all li's
|
101
|
+
ul_children = ul.children.select { |el| if el.name == 'li'
|
102
|
+
true
|
103
|
+
else
|
104
|
+
# puts "skipping #{el.name} >#{el.to_html}<"
|
105
|
+
false
|
106
|
+
end
|
107
|
+
}
|
108
|
+
puts " #{ul_children.size} li(s):"
|
109
|
+
ul_children.each_slice(2) do |lis|
|
110
|
+
li = lis[0]
|
111
|
+
div = li.at( 'div[sectiontitle]' )
|
112
|
+
if div.nil?
|
113
|
+
puts "!! ERROR: no section title found in div:"
|
114
|
+
puts li.to_html
|
115
|
+
exit 1
|
300
116
|
end
|
301
|
-
end
|
302
117
|
|
118
|
+
section_title = div['sectiontitle'].to_s
|
303
119
|
|
304
|
-
|
305
|
-
html = html.gsub( /<\/?(li|ul)[^>]*>/im ) do |m|
|
306
|
-
puts " remove list >#{m}<"
|
307
|
-
''
|
308
|
-
end
|
120
|
+
html << "<h2>#{section_title}</h2>\n"
|
309
121
|
|
310
|
-
## clean-up class attrib e.g. remove unknown classes
|
311
|
-
html = html.gsub( CLASS_ATTR_REGEX ) do |m|
|
312
|
-
puts "cleanup class attr:"
|
313
|
-
puts "#{m}"
|
314
|
-
|
315
|
-
klasses = $2.split(' ')
|
316
|
-
klasses = klasses.select do |klass|
|
317
|
-
if ['category', 'category_data'].include?( klass )
|
318
|
-
true
|
319
|
-
else
|
320
|
-
puts " remove class #{klass}"
|
321
|
-
false
|
322
|
-
end
|
323
|
-
end
|
324
122
|
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
123
|
+
li = lis[1]
|
124
|
+
## filter all div's
|
125
|
+
li_children = li.children.select { |el| if el.name =='div'
|
126
|
+
true
|
127
|
+
else
|
128
|
+
# puts "skipping #{el.name} >#{el.to_html}<"
|
129
|
+
false
|
130
|
+
end
|
131
|
+
}
|
132
|
+
puts " #{li_children.size} div(s):"
|
331
133
|
|
134
|
+
li_children.each_slice(2) do |divs|
|
135
|
+
div = divs[0]
|
136
|
+
a = div.css('a')[0]
|
332
137
|
|
333
|
-
|
334
|
-
|
138
|
+
if a
|
139
|
+
html << "\n<h3>#{a.text}:</h3>\n"
|
140
|
+
else
|
141
|
+
puts "!! WARN: no anchor found:"
|
142
|
+
puts div.to_html
|
143
|
+
end
|
335
144
|
|
336
|
-
html = html.gsub( CLEANUP_SECTION_REGEX ) do |_|
|
337
|
-
puts " cleanup section (h2) heading >#{$1}<"
|
338
145
|
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
text = text[0...pos]
|
344
|
-
end
|
345
|
-
text = text.strip # remove trailing space too
|
146
|
+
div = divs[1]
|
147
|
+
div_children = div.children.select {|el| el.name == 'div' ? true : false }
|
148
|
+
div_children.each do |catdiv|
|
149
|
+
if catdiv['class'] && catdiv['class'].index( 'category_data' )
|
346
150
|
|
347
|
-
|
151
|
+
if catdiv['class'].index( 'attachment' )
|
152
|
+
## skip attachments e.g. maps, pop pyramids, etc.
|
153
|
+
else
|
154
|
+
html << catdiv.to_html
|
155
|
+
html << "\n"
|
156
|
+
end
|
157
|
+
else
|
158
|
+
puts "!! WARN: skipping div (W/O category_data class):"
|
159
|
+
puts catdiv.to_html
|
160
|
+
end
|
161
|
+
end
|
348
162
|
end
|
163
|
+
end
|
349
164
|
|
350
|
-
html = html.gsub( CLEANUP_SUBSECTION_REGEX ) do |_|
|
351
|
-
puts " cleanup subsection (h3) heading >#{$1}<"
|
352
|
-
|
353
|
-
text = $1
|
354
|
-
text = text.strip # remove trailing space too
|
355
165
|
|
356
|
-
|
357
|
-
|
166
|
+
html = html.gsub( ARIA_ATTR_REGEX ) do |m|
|
167
|
+
puts "remove aria-label attr:"
|
168
|
+
puts "#{m}"
|
169
|
+
''
|
170
|
+
end
|
358
171
|
|
359
|
-
|
172
|
+
html
|
360
173
|
end
|
361
174
|
|
362
175
|
|
data/lib/factbook/version.rb
CHANGED
@@ -1,22 +1,21 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
end
|
1
|
+
|
2
|
+
module Factbook
|
3
|
+
|
4
|
+
MAJOR = 2
|
5
|
+
MINOR = 0
|
6
|
+
PATCH = 0
|
7
|
+
VERSION = [MAJOR,MINOR,PATCH].join('.')
|
8
|
+
|
9
|
+
def self.version
|
10
|
+
VERSION
|
11
|
+
end
|
12
|
+
|
13
|
+
def self.banner
|
14
|
+
"factbook/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
|
15
|
+
end
|
16
|
+
|
17
|
+
def self.root
|
18
|
+
File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )
|
19
|
+
end
|
20
|
+
|
21
|
+
end
|