factbook 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gemtest ADDED
File without changes
data/Manifest.txt CHANGED
@@ -3,4 +3,14 @@ Manifest.txt
3
3
  README.md
4
4
  Rakefile
5
5
  lib/factbook.rb
6
+ lib/factbook/page.rb
6
7
  lib/factbook/version.rb
8
+ test/data/countrytemplate_au.html
9
+ test/data/countrytemplate_be.html
10
+ test/data/countrytemplate_br.html
11
+ test/data/countrytemplate_mx.html
12
+ test/helper.rb
13
+ test/test_json.rb
14
+ test/test_page.rb
15
+ test/test_page_old.rb
16
+ test/test_strip.rb
data/README.md CHANGED
@@ -7,9 +7,30 @@
7
7
  * forum :: [groups.google.com/group/openmundi](https://groups.google.com/group/openmundi)
8
8
 
9
9
 
10
+
11
+ ## What's the World Factbook?
12
+
13
+ The World Factbook published by the Central Intelligence Agency (CIA)
14
+ offers free country profiles in the public domain (that is, no copyright(s), no rights reserved).
15
+
16
+ - [1] [The World Factbook](https://www.cia.gov/library/publications/the-world-factbook/)
17
+ - [2] [Wikipedia Article: The World Factbook](http://en.wikipedia.org/wiki/The_World_Factbook)
18
+
19
+
10
20
  ## Usage
11
21
 
12
- TBD
22
+ ### Get page as a hash (that is, structured data e.g. nested key/values)
23
+
24
+ page = Factbook::Page.new( 'br' )
25
+ pp page.data # pretty print hash
26
+
27
+ ### Save to disk as JSON
28
+
29
+ page = Factbook::Page.new( 'br' )
30
+ File.open( 'br.json', 'w') do |f|
31
+ f.write( JSON.pretty_generate( page.data ) )
32
+ end
33
+
13
34
 
14
35
  ## Install
15
36
 
@@ -18,9 +39,10 @@ Just install the gem:
18
39
  $ gem install factbook
19
40
 
20
41
 
42
+
21
43
  ## Alternatives
22
44
 
23
- TBD
45
+ - [worldfactbook gem](https://github.com/sayem/worldfactbook) by sayem (aka Sayem Khan); fetches data from its own mirror, that is, rubyworldfactbook.com (last updated 2011?)
24
46
 
25
47
 
26
48
  ## License
data/Rakefile CHANGED
@@ -19,7 +19,8 @@ Hoe.spec 'factbook' do
19
19
 
20
20
  self.extra_deps = [
21
21
  ['logutils' ],
22
- ['fetcher']
22
+ ['fetcher'],
23
+ ['nokogiri']
23
24
  ]
24
25
 
25
26
  self.licenses = ['Public Domain']
@@ -0,0 +1,408 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+ class Page
6
+
7
+ include LogUtils::Logging
8
+
9
+ ## standard version
10
+ ## SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'
11
+
12
+ ## -- use text (low-bandwidth) version
13
+ ## e.g. www.cia.gov/library/publications/the-world-factbook/geos/countrytemplate_br.html
14
+ SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/countrytemplate_{code}.html'
15
+
16
## Creates a page object for one country.
##
## code - country code string; it is substituted for the {code} placeholder
##        of SITE_BASE when the page gets fetched (e.g. 'br')
##
## Only stores the code in @code.
+ def initialize( code )
17
+ @code = code
18
+ end
19
+
20
## Returns the page (as returned by #html) parsed with Nokogiri::HTML.
## The parsed document is built on first use and cached in @doc.
def doc
  return @doc if @doc

  @doc = Nokogiri::HTML( html )
end
23
+
24
## Returns the page as a nested hash: one entry per section returned by
## #sects, keyed by a short section name ('intro', 'geo', ...) and holding
## whatever #sect_to_hash builds for that section.
##
## The hash is built on first use and cached in @data.
def data
  return @data unless @data.nil?

  ## keys are matched to the sections by position
  section_keys = %w[intro geo people govt econ energy comm trans military issues]

  @data = {}
  sects.each_with_index do |section, idx|
    logger.debug "############################"
    logger.debug "### stats sect #{idx}:"

    @data[ section_keys[idx] ] = sect_to_hash( section )
  end
  @data
end
49
+
50
+
51
## Splits the page source (#html) into its ten sections and returns them
## as an array of Nokogiri documents (one per section, in the fixed order
## intro, geo, people, govt, econ, energy, comm, trans, military, issues).
##
## The html gets cut up as text first (at the opening tag of every section
## panel) and every slice is parsed on its own to avoid errors w/ nested tags.
##
## A section ends one character before the nearest following section start;
## the last section runs until the end of the document.
## If a section marker is missing an error gets logged and an empty document
## is used for that section, so the positions in the returned array always
## match the section order.
##
## The result is cached in @sects.
def sects
  return @sects unless @sects.nil?

  markers = [
    '<div id="CollapsiblePanel1_Intro"',
    '<div id="CollapsiblePanel1_Geo"',
    '<div id="CollapsiblePanel1_People"',
    '<div id="CollapsiblePanel1_Govt"',
    '<div id="CollapsiblePanel1_Econ"',
    '<div id="CollapsiblePanel1_Energy"',
    '<div id="CollapsiblePanel1_Comm"',
    '<div id="CollapsiblePanel1_Trans"',
    '<div id="CollapsiblePanel1_Military"',
    '<div id="CollapsiblePanel1_Issues"' ]

  page = html

  ## start offset of every section (nil if the marker is not in the page)
  offsets = markers.each_with_index.map do |marker, i|
    offset = page.index( marker )
    if offset.nil?
      logger.error "*** error: section not found -- #{marker}"
    else
      logger.debug " found section #{i} @ #{offset}"
    end
    offset
  end

  @sects = offsets.each_with_index.map do |from, i|
    if from.nil?
      ## placeholder keeps the array aligned with the section order
      Nokogiri::HTML( '' )
    else
      ## cut until just before the nearest section that starts after this one;
      ## -1 (end of string) if there is none
      following = offsets.compact.select { |offset| offset > from }.min
      to = following.nil? ? -1 : following - 1

      logger.debug " cut section #{i} [#{from}..#{to}]"
      Nokogiri::HTML( page[ from..to ] )
    end
  end
end
102
+
103
## Sets the page source by hand (for debugging n testing);
## no need to fetch via net.
##
## html - page source as a string; note: #html returns it as is
##        (the clean-up in #html only runs for a fetched page)
##
## Everything derived from the old source (@doc, @sects, @data) gets
## reset, so the next #doc / #sects / #data call uses the new html.
def html=(html)
  @html  = html
  @doc   = nil
  @sects = nil
  @data  = nil
end
108
+
109
## Returns the page source as a string.
##
## If the source was set via #html= (or fetched before) it gets returned as is.
## Otherwise the page gets fetched and cleaned up (in this order):
##  - inline scripts, inline styles and <link> tags get removed
##  - everything before the opening <div id="countryInfo" ...> tag gets cut off
##  - "country comparison to the world" spans get removed
##  - style attributes get removed
##  - table rows holding a single empty cell get removed
##  - the "(For more information visit the World Leaders website ...)" span gets removed
## The cleaned up source is cached in @html.
##
## Returns nil (and caches nothing) if fetch returns nil.
def html
  return @html unless @html.nil?

  page = fetch()
  ## nothing fetched; hand back nil instead of failing on nil.gsub below
  return nil if page.nil?

  page = remove_matches( page, /<script[^>]*>.*?<\/script>/m, 'script' )
  page = remove_matches( page, /<style[^>]*>.*?<\/style>/m, 'style' )
  page = remove_matches( page, /<link[^>]+>/, 'link' )

  ## remove everything before <div id="countryInfo" ...>
  ##  note: allow attributes after the id (e.g. style="display: none;");
  ##   style attributes only get stripped further down
  start = page.index( /<div id="countryInfo"[^>]*>/ )
  page = page[start..-1] if start

  ## e.g. <span class="category" style="padding-left:7px;">country comparison to the world:</span>
  ##      <span class="category_data"> <a href="../rankorder/2147rank.html?..." ...> 5 </a> </span>
  country_comparison_regex = /
     <span \s class="category"[^>]*>
       country \s comparison \s to \s the \s world:
     <\/span>
     \s*
     <span \s class="category_data"[^>]*>
       \s*
       <a \s [^>]+>
         .+?
       <\/a>
       \s*
     <\/span>
  /xm
  page = remove_matches( page, country_comparison_regex, 'country comparison' )

  page = remove_matches( page, /\s*style="[^"]+"/, 'style attr' )

  ## e.g. <tr height="22">
  ##        <td class="category_data"></td>
  ##      </tr>
  tr_empty_regex = /
     <tr[^>]*>
       \s*
       <td[^>]*> \s* <\/td>
       \s*
     <\/tr>
  /xm
  page = remove_matches( page, tr_empty_regex, 'tr emtpy' )

  ## e.g. <span class="category">(For more information visit the
  ##        <a href="..." target="_blank">World Leaders website</a>&nbsp;
  ##        <img src="../graphics/soa_newwindow.gif" ... border="0"/>)
  ##      </span>
  world_leaders_website_regex = /
     <span \s class="category"[^>]*>
       \(
         For \s more \s information \s
         .+?     # non-greedy (smallest possible match)
       \)
     <\/span>
  /xm
  page = remove_matches( page, world_leaders_website_regex, 'world leader website promo' )

  @html = page
end

## Returns a copy of text with every match of pattern removed;
## every removed match gets logged (debug) with the label passed in.
def remove_matches( text, pattern, label )
  text.gsub( pattern ) do |match|
    logger.debug "remove #{label}:"
    logger.debug match
    ''
  end
end
private :remove_matches
227
+
228
+ private
229
## Fetches the page for @code; the uri is SITE_BASE with the {code}
## placeholder replaced.
##
## Returns the response body (tagged as UTF-8) if the response code is '200';
## otherwise logs an error and returns nil.
def fetch
  url      = SITE_BASE.gsub( '{code}', @code )
  response = Fetcher::Worker.new.get_response( url )

  unless response.code == '200'
    logger.error "fetch HTTP - #{response.code} #{response.message}"
    return nil
  end

  body = response.body
  ###
  # NB: Net::HTTP will NOT set encoding UTF-8 etc.
  #   will mostly be ASCII-8BIT (== BINARY == encoding unknown; raw bytes)
  logger.debug "t.encoding.name (before): #{body.encoding.name}"

  ## for now "hardcoded" to utf8 - what else can we do?
  ##  note: force_encoding will NOT change the bytes;
  ##   it only changes the assumed encoding (w/o translation)
  body = body.force_encoding( Encoding::UTF_8 )
  logger.debug "t.encoding.name (after): #{body.encoding.name}"
  body
end
257
+
258
+
259
## Turns a category title into a hash key.
##
## key - title as found in the page, e.g. 'GDP - per capita (PPP):'
##
## Steps (in order): to lower case; '(s)' => 's' (seaport(s) => seaports);
##  colons removed; special chars ()-/,' replaced by a space;
##  leading/trailing whitespace stripped; runs of spaces => one underscore.
##
## Returns the key as a new string, e.g. 'gdp_per_capita_ppp'.
def cleanup_key( key )
  key = key.downcase
  key = key.gsub( '(s)', 's' )
  key = key.gsub( ':', '' )            # trailing :
  ## note: the apostrophe belongs inside the character class
  ##  (outside it the pattern only matched a special char followed by ')
  key = key.gsub( /[()\-\/,']/, ' ' )
  key = key.strip
  key.gsub( /[ ]+/, '_' )
end
271
+
272
+
273
## Converts one section into a hash.
##
## sect - section to convert; must respond to css (data passes in the
##        Nokogiri documents returned by sects)
##
## Walks all cells (table tr td) in document order:
##  - a cell holding exactly one #field element starts a new category;
##    its single div.category text (cleaned up w/ cleanup_key) becomes
##    the current key
##  - a cell w/ id 'data' gets converted into a hash of key/value pairs
##    and stored under the current key (nil if no category was seen yet)
##  - any other cell gets logged (warn) and skipped
##
## Returns the hash (category key => hash of key/value pairs).
def sect_to_hash( sect )
  table_rows  = sect.css( 'table tr' )
  table_cells = sect.css( 'table tr td' )

  logger.debug "rows.size: #{table_rows.size} (cells.size: #{table_cells.size} / field_ids.size: #{table_rows.css( '#field' ).size} / data_ids.size: #{table_rows.css( '#data' ).size})"

  result      = {}
  current_cat = nil

  table_cells.each_with_index do |td, idx|
    if td.css( '#field' ).size == 1     # (nested) div#field in td
      headings = td.css( 'div.category' )   ## note: ignore all .category not using div
      if headings.size == 1
        current_cat = cleanup_key( headings.first.text.strip )
        logger.debug " [#{idx}] category: >>#{current_cat}<<"
      else
        logger.warn "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
        logger.warn td.to_s
      end
    elsif td['id'] == 'data'            # td#data
      result[ current_cat ] = data_cell_to_hash( td, idx, current_cat )
    else
      logger.warn "#### !!!! unknown cell type (no field or data id found):"
      logger.warn td.to_s
    end
  end

  result
end

## Converts one data cell (td#data) into a hash of key/value pairs.
##
## td          - the data cell
## idx         - position of the cell in the section (used for logging only)
## current_cat - key of the category the cell belongs to
##
## Only the div children of the cell get used:
##  - div.category starts a new pair; the key is the div's text w/o the
##    nested .category_data elements, the value is the text of the nested
##    span.category_data elements
##  - div.category_data gets appended to the value of the last pair;
##    if there's no pair yet an implied pair w/ the key 'text' gets created
##
## Returns the hash; if a key shows up more than once the last value wins.
def data_cell_to_hash( td, idx, current_cat )
  logger.debug " - [#{idx}] data cell - cats: #{td.css( 'div.category' ).size}, cats_data: #{td.css( 'div.category_data,span.category_data' ).size} (cats_div_data: #{td.css( 'div.category_data' ).size} / cats_span_data: #{td.css( 'span.category_data' ).size})"

  entries    = []     # [key, value] pairs in document order
  open_entry = nil    # pair that category_data divs get appended to
  appended   = 0      # number of category_data divs appended to open_entry

  td.children.each do |node|
    next unless node.element?

    if node.name != 'div'
      logger.warn " **** !!! skipping non-div >#{node.name}<:"
      logger.warn node.to_s
      next
    end

    case node['class']
    when 'category'
      ## collect text for category; exclude elements w/ class category_data
      label_parts = node.children.reject do |part|
        part.element? && part['class'] == 'category_data'
      end
      label = cleanup_key( label_parts.map { |part| part.text.strip }.join )

      inline_value = node.css('span.category_data').text.strip

      logger.debug " -- category >>#{label}<<"

      open_entry = [ label, inline_value ]
      appended   = 0
      entries << open_entry

    when 'category_data'
      logger.debug " -- category_data"

      chunk = node.text.strip

      if open_entry.nil?
        ## assume its the very first entry; use implied/auto-created category
        open_entry = [ 'text', '' ]
        appended   = 0
        entries << open_entry
      end

      if appended == 0 && open_entry[1] == ''
        open_entry[1] = chunk
      elsif appended == 0 || current_cat == 'demographic_profile'
        ## first chunk after an inline value, or special case
        ## demographic_profile: use a space as separator
        open_entry[1] += " #{chunk}"
      else
        open_entry[1] += "; #{chunk}"
      end
      appended += 1

    else
      logger.warn " **** !!! skipping div w/o category or category_data class:"
      logger.warn node.to_s
    end
  end

  entries.each_with_object( {} ) { |(key, value), pairs| pairs[ key ] = value }
end
405
+
406
+ end # class Page
407
+
408
+ end # module Factbook
@@ -1,5 +1,5 @@
1
1
 
2
2
  module Factbook
3
- VERSION = '0.0.1'
3
+ VERSION = '0.1.0'
4
4
  end
5
5
 
data/lib/factbook.rb CHANGED
@@ -6,6 +6,7 @@ require 'net/http'
6
6
  require 'uri'
7
7
  require 'cgi'
8
8
  require 'pp'
9
+ require 'json'
9
10
 
10
11
 
11
12
  ## 3rd party gems/libs
@@ -13,10 +14,13 @@ require 'pp'
13
14
 
14
15
  require 'logutils'
15
16
  require 'fetcher'
17
+ require 'nokogiri'
18
+
16
19
 
17
20
  # our own code
18
21
 
19
22
  require 'factbook/version' # let it always go first
23
+ require 'factbook/page'
20
24
 
21
25
 
22
26
  module Factbook