factbook 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gemtest ADDED
File without changes
data/Manifest.txt CHANGED
@@ -3,4 +3,14 @@ Manifest.txt
3
3
  README.md
4
4
  Rakefile
5
5
  lib/factbook.rb
6
+ lib/factbook/page.rb
6
7
  lib/factbook/version.rb
8
+ test/data/countrytemplate_au.html
9
+ test/data/countrytemplate_be.html
10
+ test/data/countrytemplate_br.html
11
+ test/data/countrytemplate_mx.html
12
+ test/helper.rb
13
+ test/test_json.rb
14
+ test/test_page.rb
15
+ test/test_page_old.rb
16
+ test/test_strip.rb
data/README.md CHANGED
@@ -7,9 +7,30 @@
7
7
  * forum :: [groups.google.com/group/openmundi](https://groups.google.com/group/openmundi)
8
8
 
9
9
 
10
+
11
+ ## What's the World Factbook?
12
+
13
+ The World Factbook published by the Central Intelligence Agency (CIA)
14
+ offers free country profiles in the public domain (that is, no copyright(s), no rights reserved).
15
+
16
+ - [1] [The World Factbook](https://www.cia.gov/library/publications/the-world-factbook/)
17
+ - [2] [Wikipedia Article: The World Factbook](http://en.wikipedia.org/wiki/The_World_Factbook)
18
+
19
+
10
20
  ## Usage
11
21
 
12
- TBD
22
+ ### Get page as a hash (that is, structured data, e.g. nested key/values)
23
+
24
+ page = Factbook::Page.new( 'br' )
25
+ pp page.data # pretty print hash
26
+
27
+ ### Save to disk as JSON
28
+
29
+ page = Factbook::Page.new( 'br' )
30
+ File.open( 'br.json', 'w') do |f|
31
+ f.write( JSON.pretty_generate( page.data ) )
32
+ end
33
+
13
34
 
14
35
  ## Install
15
36
 
@@ -18,9 +39,10 @@ Just install the gem:
18
39
  $ gem install factbook
19
40
 
20
41
 
42
+
21
43
  ## Alternatives
22
44
 
23
- TBD
45
+ - [worldfactbook gem](https://github.com/sayem/worldfactbook) by sayem (aka Sayem Khan); fetches data from its own mirror, that is, rubyworldfactbook.com (last updated 2011?)
24
46
 
25
47
 
26
48
  ## License
data/Rakefile CHANGED
@@ -19,7 +19,8 @@ Hoe.spec 'factbook' do
19
19
 
20
20
  self.extra_deps = [
21
21
  ['logutils' ],
22
- ['fetcher']
22
+ ['fetcher'],
23
+ ['nokogiri']
23
24
  ]
24
25
 
25
26
  self.licenses = ['Public Domain']
@@ -0,0 +1,408 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+ class Page
6
+
7
+ include LogUtils::Logging
8
+
9
+ ## standard version
10
+ ## SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'
11
+
12
+ ## -- use text (low-bandwidth) version
13
+ ## e.g. www.cia.gov/library/publications/the-world-factbook/geos/countrytemplate_br.html
14
+ SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/countrytemplate_{code}.html'
15
+
16
## Sets up a page for a single country.
##
##  code - country code that gets filled into the {code}
##         placeholder of SITE_BASE (e.g. 'br')
##
##  note: nothing gets fetched here; the page is only
##    downloaded once the html is asked for
def initialize( code )
  @code = code
end
19
+
20
## Returns the page html parsed w/ Nokogiri;
##  parsed on first use only and kept in @doc for all later calls.
def doc
  @doc = Nokogiri::HTML( html ) unless @doc
  @doc
end
23
+
24
## Returns the page as a nested hash - one entry per section,
##  keyed by the short section title (intro, geo, people, ...).
##
##  The hash gets built on first use only (from sects, w/ sect_to_hash)
##  and is kept in @data for all later calls.
def data
  return @data unless @data.nil?

  ## note: titles are matched to sections by position (index)
  titles = %w[intro geo people govt econ energy comm trans military issues]

  @data = {}

  sects.each_with_index do |sect,i|
    logger.debug "############################"
    logger.debug "### stats sect #{i}:"

    key = titles[i]
    @data[ key ] = sect_to_hash( sect )
  end

  @data
end
49
+
50
+
51
## Splits the page html into sections - one Nokogiri document per
##  section marker (Intro, Geo, People, ...) - to avoid errors w/ nested tags.
##
##  Returns an array w/ exactly one entry per marker (in marker order);
##  the result gets built on first use only and is kept in @sects.
##
##  note: a section that cannot be found no longer raises; an error
##    gets logged and an empty document is used for its slot, so that
##    the sections that follow keep their position (index).
def sects
  return @sects unless @sects.nil?

  ## opening tags marking the start of a section
  divs = [
    '<div id="CollapsiblePanel1_Intro"',
    '<div id="CollapsiblePanel1_Geo"',
    '<div id="CollapsiblePanel1_People"',
    '<div id="CollapsiblePanel1_Govt"',
    '<div id="CollapsiblePanel1_Econ"',
    '<div id="CollapsiblePanel1_Energy"',
    '<div id="CollapsiblePanel1_Comm"',
    '<div id="CollapsiblePanel1_Trans"',
    '<div id="CollapsiblePanel1_Military"',
    '<div id="CollapsiblePanel1_Issues"' ]

  ## note: a missing page (nil) gets handled like an empty page
  page = html.to_s

  ## start offset of every section; nil if the marker is missing
  @pos = []
  divs.each_with_index do |div,i|
    offset = page.index( div )
    if offset.nil?
      logger.error "*** error: section not found -- #{div}"
    else
      logger.debug " found section #{i} @ #{offset}"
    end
    @pos << offset
  end
  @pos << -1   ## note: last entry add -1 for until the end of document

  @sects = []
  divs.each_index do |i|
    from = @pos[i]

    if from.nil?
      ## section not found - keep the slot; use an empty document
      @sects << Nokogiri::HTML( '' )
      next
    end

    ## cut up to the start of the next section found
    ##   (the -1 end-of-document entry is always there as a fallback)
    to = @pos[(i+1)..-1].compact.first
    to -= 1 unless to == -1   ## note: sub one (-1) unless end-of-string (-1)

    ## todo: check that from is smaller than to
    logger.debug " cut section #{i} [#{from}..#{to}]"
    @sects << Nokogiri::HTML( page[ from..to ] )
  end

  @sects
end
102
+
103
## Sets the page html by hand (for debugging n testing).
##
##  note: once set, the html reader returns this text as is -
##    no fetch via the net (and no cleanup) happens
def html=(html)
  @html = html
end
108
+
109
## Returns the page html - fetched and cleaned up on first use only
##  and kept in @html for all later calls.
##
##  Cleanup steps (in order):
##   - remove inline scripts, inline styles and link tags
##   - remove everything before <div id="countryInfo" >
##   - remove "country comparison to the world" rank links
##   - remove style attributes
##   - remove empty table rows
##   - remove the world leaders website promo
##
##  Every removed snippet gets logged (debug).
##
##  Returns nil (w/o keeping anything) if the page cannot be fetched.
##
##  todo: remove head, and in body remove header n footer
def html
  return @html unless @html.nil?

  text = fetch
  return nil if text.nil?   ## fetch failed (and logged the error) - nothing to clean up

  ## removes all matches of regex from str; logs every removed snippet
  strip = lambda do |str, regex, what|
    str.gsub( regex ) do |m|
      logger.debug "remove #{what}:"
      logger.debug m
      ''
    end
  end

  text = strip.call( text, /<script[^>]*>.*?<\/script>/m, 'script' )
  text = strip.call( text, /<style[^>]*>.*?<\/style>/m,   'style' )
  text = strip.call( text, /<link[^>]+>/,                 'link' )

  ## remove everything before <div id="countryInfo" >
  pos = text.index( /<div id="countryInfo"\s*>/ )
  text = text[pos..-1] if pos

  ## remove country comparison
  ##  e.g. <span class="category" style="padding-left:7px;">country comparison to the world:</span>
  ##       <span class="category_data">
  ##         <a href="../rankorder/2147rank.html?countryname=Brazil..." title="Country comparison to the world"> 5 </a>
  ##       </span>
  country_comparison_regex = /
     <span \s class="category"[^>]*>
       country \s comparison \s to \s the \s world:
     <\/span>
     \s*
     <span \s class="category_data"[^>]*>
       \s*
       <a \s [^>]+>
         .+?
       <\/a>
       \s*
     <\/span>
   /xm
  text = strip.call( text, country_comparison_regex, 'country comparison' )

  text = strip.call( text, /\s*style="[^"]+"/, 'style attr' )

  ## remove empty rows
  ##  e.g. <tr height="22">
  ##         <td class="category_data"></td>
  ##       </tr>
  tr_empty_regex = /
     <tr[^>]*>
       \s*
       <td[^>]*> \s* <\/td>
       \s*
     <\/tr>
   /xm
  text = strip.call( text, tr_empty_regex, 'tr empty' )

  ## remove world leader website promo
  ##  e.g. <span class="category">(For more information visit the
  ##         <a href="/library/publications/world-leaders-1/index.html" target="_blank">World Leaders website</a>&nbsp;
  ##         <img src="../graphics/soa_newwindow.gif" alt="Opens in New Window" title="Opens in New Window" border="0"/>)
  ##       </span>
  world_leaders_website_regex = /
     <span \s class="category"[^>]*>
       \(
         For \s more \s information \s
         .+?      ## non-greedy - smallest possible match
       \)
     <\/span>
   /xm
  text = strip.call( text, world_leaders_website_regex, 'world leader website promo' )

  ## note: only keep the text once all cleanup steps are done
  @html = text
end
227
+
228
+ private
229
## Downloads the page for @code (SITE_BASE w/ {code} filled in).
##
##  Returns the response body marked as utf-8, or
##  nil (after logging an error) for any response code other than 200.
def fetch
  url      = SITE_BASE.gsub( '{code}', @code )
  response = Fetcher::Worker.new.get_response( url )

  unless response.code == '200'
    logger.error "fetch HTTP - #{response.code} #{response.message}"
    return nil
  end

  body = response.body

  ###
  # NB: Net::HTTP will NOT set encoding UTF-8 etc.
  #   will mostly be ASCII
  #   (ASCII-8BIT == BINARY == encoding unknown; raw bytes here)
  logger.debug "t.encoding.name (before): #{body.encoding.name}"

  ## NB: for now "hardcoded" to utf8 - what else can we do?
  #   - note: force_encoding will NOT change the chars;
  #       it only changes the assumed encoding w/o translation
  body = body.force_encoding( Encoding::UTF_8 )
  logger.debug "t.encoding.name (after): #{body.encoding.name}"

  body
end
257
+
258
+
259
## Turns a category title into a key, that is,
##  all lower case w/ words joined by underscores.
##
##  e.g. Seaport(s):                     => seaports
##       Airports - with paved runways:  => airports_with_paved_runways
##
##  key - title text (String)
##
##  Returns a new string; the string passed in is not changed.
def cleanup_key( key )
  key = key.downcase
  key = key.gsub( '(s)', 's' )   ## seaport(s) => seaports
  key = key.gsub( ':', '' )      ## trailing :
  ## replace special chars ()-/,' w/ a space
  ##  note: the apostrophe must be inside the character class; outside,
  ##    only a special char followed by an apostrophe would match
  key = key.gsub( /[()\-\/,']/, ' ' )
  key = key.strip
  key.gsub( /[ ]+/, '_' )        ## one underscore per run of spaces
end
271
+
272
+
273
## Converts a section (Nokogiri document) into a nested hash.
##
##  Walks all table cells (td) in document order:
##   - a cell holding a (nested) #field element starts a new category;
##       its div.category text (cleaned up) becomes the key
##   - a td#data cell gets turned into a hash of name/value pairs and
##       gets stored under the key of the last category seen
##   - anything else gets logged (warn) and skipped
##
##  Returns a hash of hashes, e.g. { category => { name => value } }
def sect_to_hash( sect )

  rows      = sect.css( 'table tr' )
  cells     = sect.css( 'table tr td' )
  field_ids = rows.css( '#field' )    ## check - use div#field.category -- possible?
  data_ids  = rows.css( '#data' )

  logger.debug "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"

  hash     = {}
  last_cat = nil

  cells.each_with_index do |cell,i|
    ## next if i > 14   ## skip after xx for debugging for now

    ## -- field cell, that is, td w/ (nested) div#field
    if cell.css( '#field' ).size == 1
      cats = cell.css( 'div.category' )    ## note: ignore all .category not using div
      if cats.size != 1
        logger.warn "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
        logger.warn cell.to_s
        next
      end

      last_cat = cleanup_key( cats.first.text.strip )
      logger.debug " [#{i}] category: >>#{last_cat}<<"
      next
    end

    ## -- neither field nor td#data
    unless cell['id'] == 'data'
      logger.warn "#### !!!! unknown cell type (no field or data id found):"
      logger.warn cell.to_s
      next
    end

    ## -- data cell, that is, td#data
    cats           = cell.css( 'div.category' )
    cats_data      = cell.css( 'div.category_data,span.category_data' )   ## note: ignore a.category_data etc.
    cats_div_data  = cell.css( 'div.category_data' )
    cats_span_data = cell.css( 'span.category_data' )

    logger.debug " - [#{i}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"

    pairs      = []
    pair       = nil    ## pair getting filled up, that is, [name, value]
    data_count = 0      ## number of category_data blocks added to pair so far

    ## separator used from the second category_data block on
    ##   special case: demographic profile uses a space
    separator = last_cat == 'demographic_profile' ? ' ' : '; '

    ## loop over div blocks (might be .category or .category_data)
    cell.children.each do |child|
      next unless child.element?

      if child.name != 'div'
        logger.warn " **** !!! skipping non-div >#{child.name}<:"
        logger.warn child.to_s
        next
      end

      case child['class']
      when 'category'
        ## collect text for name; exclude elements w/ class category_data
        name = ''
        child.children.each do |subchild|
          next if subchild.element? && subchild['class'] == 'category_data'
          name << subchild.text.strip
        end
        name  = cleanup_key( name )
        value = child.css( 'span.category_data' ).text.strip

        logger.debug " -- category >>#{name}<<"

        ## start new pair
        pair       = [ name, value ]
        data_count = 0
        pairs << pair

      when 'category_data'
        logger.debug " -- category_data"

        text = child.text.strip

        if pair.nil?
          ## assume its the very first entry; use implied/auto-created name
          pair       = [ 'text', '' ]
          data_count = 0
          pairs << pair
        end

        if data_count > 0
          pair[1] += "#{separator}#{text}"
        elsif pair[1] == ''
          pair[1] = text
        else
          pair[1] += " #{text}"    ## value from category block first; add w/ space
        end
        data_count += 1

      else
        logger.warn " **** !!! skipping div w/o category or category_data class:"
        logger.warn child.to_s
      end
    end

    ## pairs to hash
    hash[ last_cat ] = pairs.each_with_object( {} ) do |(name, value), pairs_hash|
      pairs_hash[ name ] = value
    end
  end # each cell

  hash
end # method sect_to_hash
405
+
406
+ end # class Page
407
+
408
+ end # module Factbook
@@ -1,5 +1,5 @@
1
1
 
2
2
  module Factbook
3
- VERSION = '0.0.1'
3
+ VERSION = '0.1.0'
4
4
  end
5
5
 
data/lib/factbook.rb CHANGED
@@ -6,6 +6,7 @@ require 'net/http'
6
6
  require 'uri'
7
7
  require 'cgi'
8
8
  require 'pp'
9
+ require 'json'
9
10
 
10
11
 
11
12
  ## 3rd party gems/libs
@@ -13,10 +14,13 @@ require 'pp'
13
14
 
14
15
  require 'logutils'
15
16
  require 'fetcher'
17
+ require 'nokogiri'
18
+
16
19
 
17
20
  # our own code
18
21
 
19
22
  require 'factbook/version' # let it always go first
23
+ require 'factbook/page'
20
24
 
21
25
 
22
26
  module Factbook