factbook 1.2.2 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,4 +1,3 @@
1
- # encoding: utf-8
2
1
 
3
2
  module Factbook
4
3
 
@@ -37,52 +36,24 @@ def sanitize( html_ascii )
37
36
  page_info.last_updated = find_page_last_updated( html_ascii )
38
37
 
39
38
 
40
- html_profile_ascii = find_country_profile( html_ascii ) ## cut-off headers, footers, scripts, etc.
39
+ html = find_country_profile( html_ascii ) ## cut-off headers, footers, scripts, etc.
41
40
 
42
41
  ## todo/fix: assume windows 12xx encoding!!!! for factbook - try
43
- html, errors = encode_utf8( html_profile_ascii ) ## change encoding to utf-8 (from binary/ascii8bit)
42
+ # html, errors = encode_utf8( html_profile_ascii ) ## change encoding to utf-8 (from binary/ascii8bit)
44
43
 
45
- html = sanitize_profile( html )
44
+ # html = sanitize_profile( html )
46
45
 
47
- [html, page_info, errors]
46
+ [html, page_info, []]
48
47
  end
49
48
 
50
49
 
51
-
52
- ####
53
- # example match:
54
- #
55
- # <ul class="expandcollapse">
56
-
57
- BEGIN_FACTS_REGEX = /<ul \s+
58
- class="expandcollapse">
59
- /xim ## ignore case; multi-line; ignore space (use \s for space/newline!)
60
-
61
-
62
- ####
63
- # example match:
64
- #
65
- # ++ 2016 Nov/3:
66
- #
67
- # </li>
68
- # </ul>
69
- # <!-- end generated content -->
70
- #
71
- ### todo: just use first match of </li></ul> - why? why not?
72
- #
73
- # history/changes:
74
- # ++ 2015 Sept/24 (for regex see attic):
75
- #
76
- # </li>
77
- # </ul>
78
- # </tbody></table>
79
- #
80
-
81
-
82
- END_FACTS_REGEX = /<\/li> \s*
83
- <\/ul> \s*
84
- <!-- \s end \s generated \s content \s -->
85
- /xim ## ignore case; multi-line; ignore space (use \s for space/newline!)
50
+ #
51
+ # <span class="subfield-date" aria-label="Date of information: 2018">(2018)</span>
52
+ #
53
+ # remove aria labels
54
+ ARIA_ATTR_REGEX = /\s*
55
+ aria-label=('|").+?\1 ## note: use non-greedy match e.g. .+?
56
+ /xim ## note: the m flag DOES allow multi-line matches (dot matches newline) - intended? why? why not?
86
57
 
87
58
 
88
59
  def find_country_profile( html )
@@ -90,273 +61,115 @@ def find_country_profile( html )
90
61
  ## remove header (everything before)
91
62
  ## <ul class="expandcollapse">
92
63
 
93
- pos = html.index( BEGIN_FACTS_REGEX )
94
- fail "*** no begin facts marker found for page" if pos.nil?
64
+ doc = Nokogiri::HTML( html )
95
65
 
96
- puts " bingo - found BEGIN_FACTS on pos #{pos}"
97
- html = html[pos..-1]
66
+ ul = doc.css( 'ul.expandcollapse' )[0]
98
67
 
99
- pp html[0..100]
68
+ puts ul.to_html[0..100]
100
69
 
101
- ###
102
- ## remove footer
103
- ## assume everything after (last list item in unordered list inside a table body)
104
- ## </li>
105
- ## </ul>
106
- ## </tbody></table>
107
70
 
108
- pos = html.index( END_FACTS_REGEX )
109
- fail "*** no end facts marker found for page" if pos.nil?
110
71
 
111
- puts " bingo - found END_FACTS on pos #{pos}"
112
- html = html[0...pos] + "</li></ul>\n" ## note: use ... (not .. to cut-off pos)
113
-
114
- pp html[-200..-1]
115
- html
116
- end
72
+ ## note: special case cc uses h2 instead of div block
73
+ ## <h2 class="question cam_med" sectiontitle="Introduction" ccode="cc"
74
+ ## style="border-bottom: 2px solid white; cursor: pointer;">
75
+ ## Introduction :: <span class="region">CURACAO </span>
76
+ ## </h2>
77
+ ## is old format !!!!
78
+ ## cc - CURACAO
79
+ ## http header says - last-modified: Wed, 14 Nov 2018 14:09:28 GMT
80
+ ## page says - PAGE LAST UPDATED ON MARCH 14, 2018
81
+ ## wait for new version to be generated / pushed!!!
117
82
 
83
+ ## check for old format if h2 are present
84
+ h2s = ul.css( 'h2' )
85
+ if h2s.size > 0
86
+ puts " !! WARN: found #{h2s.size} h2(s) - assume old format - sorry - must wait for update!!!"
87
+ ## return empty html string - why? why not?
88
+ return ''
89
+ end
118
90
 
119
91
 
120
- STYLE_ATTR_REGEX = /\s*
121
- style=('|").+?\1 ## note: use non-greedy match e.g. .+?
122
- /xim ## do NOT allow multi-line - why? why not?
123
-
124
- CLASS_ATTR_REGEX = /\s*
125
- class=('|")(.+?)\1 ## note: use non-greedy match e.g. .+?
126
- /xim ## do NOT allow multi-line - why? why not?
127
-
128
- ##
129
- ## <div>
130
- ## <span class='category'>country comparison to the world: </span>
131
- ## <span class='category_data'>[[191]]</span>
132
- ## </div>
133
- ##
134
- ## <span class='category'>country comparison to the world: </span>
135
- ## <span class='category_data'><a href='../rankorder/2147rank.html#au'>114</a></span>
136
-
137
-
138
- ## todo: add enclosing div too!!!
139
-
140
- COUNTRY_COMPARISON_REGEX = /
141
- <div>
142
- <span \s class='category'[^>]*>
143
- country \s comparison \s to \s the \s world: \s*
144
- <\/span>
145
- \s*
146
- <span \s class='category_data'[^>]*>
147
- \s*
148
- <a \s [^>]+>
149
- .+?
150
- <\/a>
151
- \s*
152
- <\/span>
153
- <\/div>
154
- /xim
155
-
156
- ##
157
- ## <div class='wrap'>
158
- ## <div class='audio-player'>
159
- ## <audio id='audio-player-1' class='my-audio-player' src='../anthems/AU.mp3' type='audio/mp3' controls='controls'>
160
- ## </audio>
161
- ## </div></div>
162
-
163
-
164
- AUDIO_PLAYER_REGEX = /
165
- <div \s class='wrap'>
166
- <div \s class='audio-player'>
167
- <audio \s [^>]+>
168
- <\/audio>
169
- <\/div>
170
- <\/div>
171
- /xim
172
-
173
-
174
- ## remove category => Area comparison map:
175
- ##
176
- ## <div class='disTable areaComp'
177
- ## ...
178
- ## until hitting: <div id='field' -- e.g. next category/field (use lookahead e.g. (?=))
179
-
180
- AREA_COMP_CATEGORY_REGEX = /
181
- <div \s class='disTable \s areaComp'
182
- .+?
183
- (?=<div \s id='field')
184
- /xim
185
-
186
-
187
- ## remove category => population pyramid:
188
- ##
189
- ## <div class='disTable popPyramid'>
190
- ## ...
191
- ## until hitting: <div id='field' -- e.g. next category/field (use lookahead e.g. (?=))
192
-
193
- POP_PYRAMID_CATEGORY_REGEX = /
194
- <div \s class='disTable \s popPyramid'
195
- .+?
196
- (?=<div \s id='field')
197
- /xim
198
-
199
- ## remove category => religious affiliation:
200
- ##
201
- ## <div class='disTable relAffiliation'>
202
- ## ...
203
- ## until hitting: <div id='field' -- e.g. next category/field (use lookahead e.g. (?=))
204
-
205
- REL_AFFILIATION_CATEGORY_REGEX = /
206
- <div \s class='disTable \s relAffiliation'
207
- .+?
208
- (?=<div \s id='field')
209
- /xim
210
-
211
-
212
- ##########################################
213
- ## transforms / simplify
214
- ##
215
- ## <h2 sectiontitle='Introduction' ccode='ag'>
216
- ## Introduction :: <span class='region'>ALGERIA </span>
217
- ## </h2>
218
- ## becomes =>
219
- ## <h2>Introduction</h2>
220
- ##
221
- ## todo/fix: use named capture in future e.g.
222
- ## (?<text>.+?) instead of (.+?)
223
- ## not working for now w/ gsub (just passed in match string NOT match data)
224
-
225
- CLEANUP_SECTION_REGEX = /
226
- <h2 [^>]*>
227
- (.+?)
228
- <\/h2>
229
- /xim
230
-
231
- ##
232
- ## <div id='field' class='category'>Electricity - consumption:</div>
233
- ## becomes =>
234
- ## <h3>Electricity - consumption:</h3>
235
-
236
- CLEANUP_SUBSECTION_REGEX = /
237
- <div \s id='field' [^>]*>
238
- (.+?)
239
- <\/div>
240
- /xim
241
-
242
-
243
-
244
- def sanitize_profile( html )
245
-
246
- ## remove categories w/ visualizations/graphics only e.g.
247
- ## - area comparions map
248
- ## - population pyramid
249
- ## - religious affiliation
250
-
251
- html = html.gsub( AREA_COMP_CATEGORY_REGEX ) do |m|
252
- puts "remove category => area comparison map:"
253
- puts "#{m}"
254
- ''
255
- end
256
-
257
- html = html.gsub( POP_PYRAMID_CATEGORY_REGEX ) do |m|
258
- puts "remove category => population pyramid:"
259
- puts "#{m}"
260
- ''
261
- end
262
-
263
- html = html.gsub( REL_AFFILIATION_CATEGORY_REGEX ) do |m|
264
- puts "remove category => religious affiliation:"
265
- puts "#{m}"
266
- ''
267
- end
268
-
269
- ################################################
270
- ## more - let's get started
271
-
272
- html = html.gsub( STYLE_ATTR_REGEX ) do |m|
273
- puts "remove style attr:"
274
- puts "#{m}"
275
- ''
276
- end
277
-
278
- html = html.gsub( AUDIO_PLAYER_REGEX ) do |m|
279
- puts "remove audio player:"
280
- puts "#{m}"
281
- ''
282
- end
283
-
284
- html = html.gsub( COUNTRY_COMPARISON_REGEX ) do |m|
285
- puts "remove country comparison:"
286
- puts "#{m}"
287
- ''
288
- end
289
-
290
- ## remove/cleanup anchors (a href)
291
- html = html.gsub( /<a\s+[^>]+>(.+?)<\/a>/im ) do |_| ## note: use .+? non-greedy match
292
- puts " replace anchor (a) >#{$1}<"
293
-
294
- inner_text = $1.dup ## keep a copy
295
- if inner_text =~ /<img/ ## if includes image remove
296
- puts " remove image in anchor"
297
- ''
298
- else ## keep inner text
299
- inner_text
92
+ ###
93
+ ## sanitize
94
+
95
+ ## remove link items
96
+ ## assume two <li>s are a section
97
+
98
+ html = String.new('')
99
+
100
+ ## filter all li's
101
+ ul_children = ul.children.select { |el| if el.name == 'li'
102
+ true
103
+ else
104
+ # puts "skipping #{el.name} >#{el.to_html}<"
105
+ false
106
+ end
107
+ }
108
+ puts " #{ul_children.size} li(s):"
109
+ ul_children.each_slice(2) do |lis|
110
+ li = lis[0]
111
+ div = li.at( 'div[sectiontitle]' )
112
+ if div.nil?
113
+ puts "!! ERROR: no section title found in div:"
114
+ puts li.to_html
115
+ exit 1
300
116
  end
301
- end
302
117
 
118
+ section_title = div['sectiontitle'].to_s
303
119
 
304
- ## remove all list e.g. ul/li
305
- html = html.gsub( /<\/?(li|ul)[^>]*>/im ) do |m|
306
- puts " remove list >#{m}<"
307
- ''
308
- end
120
+ html << "<h2>#{section_title}</h2>\n"
309
121
 
310
- ## clean-up class attrib e.g. remove unknown classes
311
- html = html.gsub( CLASS_ATTR_REGEX ) do |m|
312
- puts "cleanup class attr:"
313
- puts "#{m}"
314
-
315
- klasses = $2.split(' ')
316
- klasses = klasses.select do |klass|
317
- if ['category', 'category_data'].include?( klass )
318
- true
319
- else
320
- puts " remove class #{klass}"
321
- false
322
- end
323
- end
324
122
 
325
- if klasses.size > 0
326
- " class='#{klasses.join(' ')}'" ## note: add leading space!!
327
- else
328
- '' ## remove class attrib completely
329
- end
330
- end
123
+ li = lis[1]
124
+ ## filter all div's
125
+ li_children = li.children.select { |el| if el.name =='div'
126
+ true
127
+ else
128
+ # puts "skipping #{el.name} >#{el.to_html}<"
129
+ false
130
+ end
131
+ }
132
+ puts " #{li_children.size} div(s):"
331
133
 
134
+ li_children.each_slice(2) do |divs|
135
+ div = divs[0]
136
+ a = div.css('a')[0]
332
137
 
333
- ##################################################################
334
- ## simplify/cleanup section and subsection headings
138
+ if a
139
+ html << "\n<h3>#{a.text}:</h3>\n"
140
+ else
141
+ puts "!! WARN: no anchor found:"
142
+ puts div.to_html
143
+ end
335
144
 
336
- html = html.gsub( CLEANUP_SECTION_REGEX ) do |_|
337
- puts " cleanup section (h2) heading >#{$1}<"
338
145
 
339
- text = $1
340
- pos = text.index( '::' )
341
- if pos ## if includes => :: <span> Region </span> -- cut off
342
- puts " remove :: region/country from heading"
343
- text = text[0...pos]
344
- end
345
- text = text.strip # remove trailing space too
146
+ div = divs[1]
147
+ div_children = div.children.select {|el| el.name == 'div' ? true : false }
148
+ div_children.each do |catdiv|
149
+ if catdiv['class'] && catdiv['class'].index( 'category_data' )
346
150
 
347
- "<h2>#{text}</h2>"
151
+ if catdiv['class'].index( 'attachment' )
152
+ ## skip attachments e.g. maps, pop pyramids, etc.
153
+ else
154
+ html << catdiv.to_html
155
+ html << "\n"
156
+ end
157
+ else
158
+ puts "!! WARN: skipping div (W/O category_data class):"
159
+ puts catdiv.to_html
160
+ end
161
+ end
348
162
  end
163
+ end
349
164
 
350
- html = html.gsub( CLEANUP_SUBSECTION_REGEX ) do |_|
351
- puts " cleanup subsection (h3) heading >#{$1}<"
352
-
353
- text = $1
354
- text = text.strip # remove trailing space too
355
165
 
356
- "<h3>#{text}</h3>"
357
- end
166
+ html = html.gsub( ARIA_ATTR_REGEX ) do |m|
167
+ puts "remove aria-label attr:"
168
+ puts "#{m}"
169
+ ''
170
+ end
358
171
 
359
- html
172
+ html
360
173
  end
361
174
 
362
175
 
@@ -1,22 +1,21 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
- MAJOR = 1
6
- MINOR = 2
7
- PATCH = 2
8
- VERSION = [MAJOR,MINOR,PATCH].join('.')
9
-
10
- def self.version
11
- VERSION
12
- end
13
-
14
- def self.banner
15
- "factbook/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
16
- end
17
-
18
- def self.root
19
- "#{File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )}"
20
- end
21
-
22
- end
1
+
2
+ module Factbook
3
+
4
+ MAJOR = 2
5
+ MINOR = 0
6
+ PATCH = 0
7
+ VERSION = [MAJOR,MINOR,PATCH].join('.')
8
+
9
+ def self.version
10
+ VERSION
11
+ end
12
+
13
+ def self.banner
14
+ "factbook/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
15
+ end
16
+
17
+ def self.root
18
+ File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )
19
+ end
20
+
21
+ end