factbook 1.2.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,3 @@
1
- # encoding: utf-8
2
1
 
3
2
  module Factbook
4
3
 
@@ -37,52 +36,24 @@ def sanitize( html_ascii )
37
36
  page_info.last_updated = find_page_last_updated( html_ascii )
38
37
 
39
38
 
40
- html_profile_ascii = find_country_profile( html_ascii ) ## cut-off headers, footers, scripts, etc.
39
+ html = find_country_profile( html_ascii ) ## cut-off headers, footers, scripts, etc.
41
40
 
42
41
  ## todo/fix: assume windows 12xx encoding!!!! for factbook - try
43
- html, errors = encode_utf8( html_profile_ascii ) ## change encoding to utf-8 (from binary/ascii8bit)
42
+ # html, errors = encode_utf8( html_profile_ascii ) ## change encoding to utf-8 (from binary/ascii8bit)
44
43
 
45
- html = sanitize_profile( html )
44
+ # html = sanitize_profile( html )
46
45
 
47
- [html, page_info, errors]
46
+ [html, page_info, []]
48
47
  end
49
48
 
50
49
 
51
-
52
- ####
53
- # example match:
54
- #
55
- # <ul class="expandcollapse">
56
-
57
- BEGIN_FACTS_REGEX = /<ul \s+
58
- class="expandcollapse">
59
- /xim ## ignore case; multi-line; ignore space (use \s for space/newline!)
60
-
61
-
62
- ####
63
- # example match:
64
- #
65
- # ++ 2016 Nov/3:
66
- #
67
- # </li>
68
- # </ul>
69
- # <!-- end generated content -->
70
- #
71
- ### todo: just use first match of </li></ul> - why? why not?
72
- #
73
- # history/changes:
74
- # ++ 2015 Sept/24 (for regex see attic):
75
- #
76
- # </li>
77
- # </ul>
78
- # </tbody></table>
79
- #
80
-
81
-
82
- END_FACTS_REGEX = /<\/li> \s*
83
- <\/ul> \s*
84
- <!-- \s end \s generated \s content \s -->
85
- /xim ## ignore case; multi-line; ignore space (use \s for space/newline!)
50
+ #
51
+ # <span class="subfield-date" aria-label="Date of information: 2018">(2018)</span>
52
+ #
53
+ # remove aria labels
54
+ ARIA_ATTR_REGEX = /\s*
55
+ aria-label=('|").+?\1 ## note: use non-greedy match e.g. .+?
56
+ /xim ## do NOT allow multi-line - why? why not?
86
57
 
87
58
 
88
59
  def find_country_profile( html )
@@ -90,273 +61,115 @@ def find_country_profile( html )
90
61
  ## remove header (everything before)
91
62
  ## <ul class="expandcollapse">
92
63
 
93
- pos = html.index( BEGIN_FACTS_REGEX )
94
- fail "*** no begin facts marker found for page" if pos.nil?
64
+ doc = Nokogiri::HTML( html )
95
65
 
96
- puts " bingo - found BEGIN_FACTS on pos #{pos}"
97
- html = html[pos..-1]
66
+ ul = doc.css( 'ul.expandcollapse' )[0]
98
67
 
99
- pp html[0..100]
68
+ puts ul.to_html[0..100]
100
69
 
101
- ###
102
- ## remove footer
103
- ## assume everthings after (last list item in unorder list inside a table body)
104
- ## </li>
105
- ## </ul>
106
- ## </tbody></table>
107
70
 
108
- pos = html.index( END_FACTS_REGEX )
109
- fail "*** no end facts marker found for page" if pos.nil?
110
71
 
111
- puts " bingo - found END_FACTS on pos #{pos}"
112
- html = html[0...pos] + "</li></ul>\n" ## note: use ... (not .. to cut-off pos)
113
-
114
- pp html[-200..-1]
115
- html
116
- end
72
+ ## note: special case cc uses h2 instead of div block
73
+ ## <h2 class="question cam_med" sectiontitle="Introduction" ccode="cc"
74
+ ## style="border-bottom: 2px solid white; cursor: pointer;">
75
+ ## Introduction :: <span class="region">CURACAO </span>
76
+ ## </h2>
77
+ ## is old format !!!!
78
+ ## cc - CURACAO
79
+ ## http headers says - last-modified: Wed, 14 Nov 2018 14:09:28 GMT
80
+ ## page says - PAGE LAST UPDATED ON MARCH 14, 2018
81
+ ## wait for new version to be generated / pushed!!!
117
82
 
83
+ ## check for old format if h2 are present
84
+ h2s = ul.css( 'h2' )
85
+ if h2s.size > 0
86
+ puts " !! WARN: found #{h2s.size} h2(s) - assume old format - sorry - must wait for update!!!"
87
+ ## return empty html string - why? why not?
88
+ return ''
89
+ end
118
90
 
119
91
 
120
- STYLE_ATTR_REGEX = /\s*
121
- style=('|").+?\1 ## note: use non-greedy match e.g. .+?
122
- /xim ## do NOT allow multi-line - why? why not?
123
-
124
- CLASS_ATTR_REGEX = /\s*
125
- class=('|")(.+?)\1 ## note: use non-greedy match e.g. .+?
126
- /xim ## do NOT allow multi-line - why? why not?
127
-
128
- ##
129
- ## <div>
130
- ## <span class='category'>country comparison to the world: </span>
131
- ## <span class='category_data'>[[191]]</span>
132
- ## </div>
133
- ##
134
- ## <span class='category'>country comparison to the world: </span>
135
- ## <span class='category_data'><a href='../rankorder/2147rank.html#au'>114</a></span>
136
-
137
-
138
- ## todo: add enclosing div too!!!
139
-
140
- COUNTRY_COMPARISON_REGEX = /
141
- <div>
142
- <span \s class='category'[^>]*>
143
- country \s comparison \s to \s the \s world: \s*
144
- <\/span>
145
- \s*
146
- <span \s class='category_data'[^>]*>
147
- \s*
148
- <a \s [^>]+>
149
- .+?
150
- <\/a>
151
- \s*
152
- <\/span>
153
- <\/div>
154
- /xim
155
-
156
- ##
157
- ## <div class='wrap'>
158
- ## <div class='audio-player'>
159
- ## <audio id='audio-player-1' class='my-audio-player' src='../anthems/AU.mp3' type='audio/mp3' controls='controls'>
160
- ## </audio>
161
- ## </div></div>
162
-
163
-
164
- AUDIO_PLAYER_REGEX = /
165
- <div \s class='wrap'>
166
- <div \s class='audio-player'>
167
- <audio \s [^>]+>
168
- <\/audio>
169
- <\/div>
170
- <\/div>
171
- /xim
172
-
173
-
174
- ## remove category => Area comparison map:
175
- ##
176
- ## <div class='disTable areaComp'
177
- ## ...
178
- ## until hitting: <div id='field' -- e.g. next category/field (use lookahead e.g. (?=))
179
-
180
- AREA_COMP_CATEGORY_REGEX = /
181
- <div \s class='disTable \s areaComp'
182
- .+?
183
- (?=<div \s id='field')
184
- /xim
185
-
186
-
187
- ## remove category => population pyramid:
188
- ##
189
- ## <div class='disTable popPyramid'>
190
- ## ...
191
- ## until hitting: <div id='field' -- e.g. next category/field (use lookahead e.g. (?=))
192
-
193
- POP_PYRAMID_CATEGORY_REGEX = /
194
- <div \s class='disTable \s popPyramid'
195
- .+?
196
- (?=<div \s id='field')
197
- /xim
198
-
199
- ## remove category => religious affiliation:
200
- ##
201
- ## <div class='disTable relAffiliation'>
202
- ## ...
203
- ## until hitting: <div id='field' -- e.g. next category/field (use lookahead e.g. (?=))
204
-
205
- REL_AFFILIATION_CATEGORY_REGEX = /
206
- <div \s class='disTable \s relAffiliation'
207
- .+?
208
- (?=<div \s id='field')
209
- /xim
210
-
211
-
212
- ##########################################
213
- ## transforms / simplify
214
- ##
215
- ## <h2 sectiontitle='Introduction' ccode='ag'>
216
- ## Introduction :: <span class='region'>ALGERIA </span>
217
- ## </h2>
218
- ## becomes =>
219
- ## <h2>Introduction</h2>
220
- ##
221
- ## todo/fix: use named capture in future e.g.
222
- ## (?<text>.+?) instead of (.+?)
223
- ## not working for now w/ gsub (just passed in match string NOT match data)
224
-
225
- CLEANUP_SECTION_REGEX = /
226
- <h2 [^>]*>
227
- (.+?)
228
- <\/h2>
229
- /xim
230
-
231
- ##
232
- ## <div id='field' class='category'>Electricity - consumption:</div>
233
- ## becomes =>
234
- ## <h3>Electricity - consumption:</h3>
235
-
236
- CLEANUP_SUBSECTION_REGEX = /
237
- <div \s id='field' [^>]*>
238
- (.+?)
239
- <\/div>
240
- /xim
241
-
242
-
243
-
244
- def sanitize_profile( html )
245
-
246
- ## remove categories w/ visualizations/graphics only e.g.
247
- ## - area comparions map
248
- ## - population pyramid
249
- ## - religious affiliation
250
-
251
- html = html.gsub( AREA_COMP_CATEGORY_REGEX ) do |m|
252
- puts "remove category => area comparison map:"
253
- puts "#{m}"
254
- ''
255
- end
256
-
257
- html = html.gsub( POP_PYRAMID_CATEGORY_REGEX ) do |m|
258
- puts "remove category => population pyramid:"
259
- puts "#{m}"
260
- ''
261
- end
262
-
263
- html = html.gsub( REL_AFFILIATION_CATEGORY_REGEX ) do |m|
264
- puts "remove category => religious affiliation:"
265
- puts "#{m}"
266
- ''
267
- end
268
-
269
- ################################################
270
- ## more - let's get started
271
-
272
- html = html.gsub( STYLE_ATTR_REGEX ) do |m|
273
- puts "remove style attr:"
274
- puts "#{m}"
275
- ''
276
- end
277
-
278
- html = html.gsub( AUDIO_PLAYER_REGEX ) do |m|
279
- puts "remove audio player:"
280
- puts "#{m}"
281
- ''
282
- end
283
-
284
- html = html.gsub( COUNTRY_COMPARISON_REGEX ) do |m|
285
- puts "remove country comparison:"
286
- puts "#{m}"
287
- ''
288
- end
289
-
290
- ## remove/cleanup anchors (a href)
291
- html = html.gsub( /<a\s+[^>]+>(.+?)<\/a>/im ) do |_| ## note: use .+? non-greedy match
292
- puts " replace anchor (a) >#{$1}<"
293
-
294
- inner_text = $1.dup ## keep a copy
295
- if inner_text =~ /<img/ ## if includes image remove
296
- puts " remove image in anchor"
297
- ''
298
- else ## keep inner text
299
- inner_text
92
+ ###
93
+ ## sanitize
94
+
95
+ ## remove link items
96
+ ## assume two <li>s are a section
97
+
98
+ html = String.new('')
99
+
100
+ ## filter all li's
101
+ ul_children = ul.children.select { |el| if el.name == 'li'
102
+ true
103
+ else
104
+ # puts "skipping #{el.name} >#{el.to_html}<"
105
+ false
106
+ end
107
+ }
108
+ puts " #{ul_children.size} li(s):"
109
+ ul_children.each_slice(2) do |lis|
110
+ li = lis[0]
111
+ div = li.at( 'div[sectiontitle]' )
112
+ if div.nil?
113
+ puts "!! ERROR: no section title found in div:"
114
+ puts li.to_html
115
+ exit 1
300
116
  end
301
- end
302
117
 
118
+ section_title = div['sectiontitle'].to_s
303
119
 
304
- ## remove all list e.g. ul/li
305
- html = html.gsub( /<\/?(li|ul)[^>]*>/im ) do |m|
306
- puts " remove list >#{m}<"
307
- ''
308
- end
120
+ html << "<h2>#{section_title}</h2>\n"
309
121
 
310
- ## clean-up class attrib e.g. remove unknown classes
311
- html = html.gsub( CLASS_ATTR_REGEX ) do |m|
312
- puts "cleanup class attr:"
313
- puts "#{m}"
314
-
315
- klasses = $2.split(' ')
316
- klasses = klasses.select do |klass|
317
- if ['category', 'category_data'].include?( klass )
318
- true
319
- else
320
- puts " remove class #{klass}"
321
- false
322
- end
323
- end
324
122
 
325
- if klasses.size > 0
326
- " class='#{klasses.join(' ')}'" ## note: add leading space!!
327
- else
328
- '' ## remove class attrib completely
329
- end
330
- end
123
+ li = lis[1]
124
+ ## filter all div's
125
+ li_children = li.children.select { |el| if el.name =='div'
126
+ true
127
+ else
128
+ # puts "skipping #{el.name} >#{el.to_html}<"
129
+ false
130
+ end
131
+ }
132
+ puts " #{li_children.size} div(s):"
331
133
 
134
+ li_children.each_slice(2) do |divs|
135
+ div = divs[0]
136
+ a = div.css('a')[0]
332
137
 
333
- ##################################################################
334
- ## simplify/cleanup section and subsection headings
138
+ if a
139
+ html << "\n<h3>#{a.text}:</h3>\n"
140
+ else
141
+ puts "!! WARN: no anchor found:"
142
+ puts div.to_html
143
+ end
335
144
 
336
- html = html.gsub( CLEANUP_SECTION_REGEX ) do |_|
337
- puts " cleanup section (h2) heading >#{$1}<"
338
145
 
339
- text = $1
340
- pos = text.index( '::' )
341
- if pos ## if includes => :: <span> Region </span> -- cut off
342
- puts " remove :: region/country from heading"
343
- text = text[0...pos]
344
- end
345
- text = text.strip # remove trailing space too
146
+ div = divs[1]
147
+ div_children = div.children.select {|el| el.name == 'div' ? true : false }
148
+ div_children.each do |catdiv|
149
+ if catdiv['class'] && catdiv['class'].index( 'category_data' )
346
150
 
347
- "<h2>#{text}</h2>"
151
+ if catdiv['class'].index( 'attachment' )
152
+ ## skip attachments e.g. maps, pop pyramids, etc.
153
+ else
154
+ html << catdiv.to_html
155
+ html << "\n"
156
+ end
157
+ else
158
+ puts "!! WARN: skipping div (W/O category_data class):"
159
+ puts catdiv.to_html
160
+ end
161
+ end
348
162
  end
163
+ end
349
164
 
350
- html = html.gsub( CLEANUP_SUBSECTION_REGEX ) do |_|
351
- puts " cleanup subsection (h3) heading >#{$1}<"
352
-
353
- text = $1
354
- text = text.strip # remove trailing space too
355
165
 
356
- "<h3>#{text}</h3>"
357
- end
166
+ html = html.gsub( ARIA_ATTR_REGEX ) do |m|
167
+ puts "remove aria-label attr:"
168
+ puts "#{m}"
169
+ ''
170
+ end
358
171
 
359
- html
172
+ html
360
173
  end
361
174
 
362
175
 
@@ -1,22 +1,21 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
- MAJOR = 1
6
- MINOR = 2
7
- PATCH = 2
8
- VERSION = [MAJOR,MINOR,PATCH].join('.')
9
-
10
- def self.version
11
- VERSION
12
- end
13
-
14
- def self.banner
15
- "factbook/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
16
- end
17
-
18
- def self.root
19
- "#{File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )}"
20
- end
21
-
22
- end
1
+
2
+ module Factbook
3
+
4
+ MAJOR = 2
5
+ MINOR = 0
6
+ PATCH = 0
7
+ VERSION = [MAJOR,MINOR,PATCH].join('.')
8
+
9
+ def self.version
10
+ VERSION
11
+ end
12
+
13
+ def self.banner
14
+ "factbook/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
15
+ end
16
+
17
+ def self.root
18
+ File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )
19
+ end
20
+
21
+ end